From c998647beb6c8278460e632e1433fee4dda557a4 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 15 Feb 2023 18:10:47 -0800 Subject: [PATCH 01/33] remove gmt_offset --- cpp/src/io/orc/stripe_data.cu | 8 ++-- cpp/src/io/orc/timezone.cpp | 12 ++---- cpp/src/io/orc/timezone.cuh | 77 ++++++++++++----------------------- 3 files changed, 33 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index d0d077d2611..2f58aa51236 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1446,7 +1446,8 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = kORCTimeToUTC - tz_table.gmt_offset; + s->top.data.utc_epoch = + kORCTimeToUTC - get_gmt_offset(tz_table.ttimes, tz_table.offsets, kORCTimeToUTC); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1772,9 +1773,8 @@ __global__ void __launch_bounds__(block_size) int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; int64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; - if (!tz_table.ttimes.empty()) { - seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); - } + seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); + // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 810dfe87320..cbc7fe99d46 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -26,9 +26,6 @@ namespace io { constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24); std::string const tzif_system_directory = "/usr/share/zoneinfo/"; -// Seconds from Jan 1st, 1970 to Jan 1st, 2015 -constexpr int64_t orc_utc_offset = 1420070400; - #pragma pack(push, 1) /** * @brief 32-bit TZif header @@ -461,12 +458,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - rmm::device_uvector d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); - rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(offsets, stream); - auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); - stream.synchronize(); - - return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; + auto d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream); + return {std::move(d_ttimes), std::move(d_offsets)}; } } // namespace io diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 52736d6451a..55036badd64 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -33,7 +33,6 @@ namespace cudf { namespace io { struct timezone_table_view { - int32_t gmt_offset = 0; cudf::device_span ttimes; cudf::device_span offsets; }; @@ -58,58 +57,36 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return GMT offset */ -CUDF_HOST_DEVICE inline int32_t get_gmt_offset_impl(int64_t const* ttimes, - int32_t const* offsets, - size_t count, - int64_t ts) -{ - // Returns start of the range if all elements are larger than the input timestamp - auto last_less_equal_ttime_idx = [&](long begin_idx, long end_idx, int64_t ts) { - auto const first_larger_ttime = - thrust::upper_bound(thrust::seq, ttimes + begin_idx, ttimes + end_idx, ts); - // Element before the first larger element is the last one less of equal - return std::max(first_larger_ttime - ttimes - 1, begin_idx); - }; - - auto const file_entry_cnt = count - cycle_entry_cnt; - // Search in the file entries if the timestamp is in range - if (ts <= ttimes[file_entry_cnt - 1]) { - return offsets[last_less_equal_ttime_idx(0, file_entry_cnt, ts)]; - } else { - // Search in the 400-year cycle if outside of the file entries range - return offsets[last_less_equal_ttime_idx( - file_entry_cnt, count, (ts + cycle_seconds) % cycle_seconds)]; - } -} - -/** - * @brief Host `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ -inline __host__ int32_t get_gmt_offset(cudf::host_span ttimes, - cudf::host_span offsets, - int64_t ts) -{ - CUDF_EXPECTS(ttimes.size() == offsets.size(), - "transition times and offsets must have the same length"); - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); -} - -/** - * @brief Device `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes, cudf::device_span offsets, int64_t ts) { - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); + if (ttimes.empty()) { return 0; } + + auto const ts_ttime_it = [&]() { + auto last_less_equal = [](auto begin, auto end, int64_t value) { + auto const first_larger = thrust::upper_bound(thrust::seq, begin, end, value); + // Return start of the range if all elements are larger than the value + if (first_larger == begin) return begin; + // Element before the first larger element is the last one less or equal + return first_larger - 1; + }; + + auto const file_entry_end = ttimes.begin() + (ttimes.size() - cycle_entry_cnt); + + if (ts <= *(file_entry_end - 1)) { + // Search the file entries if the timestamp is in range + return last_less_equal(ttimes.begin(), file_entry_end, ts); + } else { + // Search the 400-year cycle if outside of the file entries range + return last_less_equal(file_entry_end, ttimes.end(), (ts + cycle_seconds) % cycle_seconds); + } + }(); + + return offsets[ts_ttime_it - ttimes.begin()]; } class timezone_table { - int32_t gmt_offset = 0; rmm::device_uvector ttimes; rmm::device_uvector offsets; @@ -118,13 +95,11 @@ class timezone_table { timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} { } - timezone_table(int32_t gmt_offset, - rmm::device_uvector&& ttimes, - rmm::device_uvector&& offsets) - : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} + timezone_table(rmm::device_uvector&& ttimes, rmm::device_uvector&& offsets) + : ttimes{std::move(ttimes)}, offsets{std::move(offsets)} { } - [[nodiscard]] timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } + [[nodiscard]] timezone_table_view view() const { return {ttimes, offsets}; } }; /** From 623d25c3f53afca6b7ac28077f64eec6cb36994d Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 16 Feb 2023 15:37:46 -0800 Subject: [PATCH 02/33] minor clean up --- cpp/src/io/orc/stripe_data.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 2f58aa51236..e7eb7d43dcd 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1771,10 +1771,11 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - int64_t nanos = secondary_val; - nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); + int64_t nanos = secondary_val; + nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; + // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. @@ -1788,18 +1789,16 @@ __global__ void __launch_bounds__(block_size) using cuda::std::chrono::duration_cast; switch (s->chunk.timestamp_type_id) { case type_id::TIMESTAMP_SECONDS: - return d_s.count() + duration_cast(d_ns).count(); + return (d_s + duration_cast(d_ns)).count(); case type_id::TIMESTAMP_MILLISECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (d_s + duration_cast(d_ns)).count(); case type_id::TIMESTAMP_MICROSECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (d_s + duration_cast(d_ns)).count(); case type_id::TIMESTAMP_NANOSECONDS: default: - return duration_cast(d_s).count() + - d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and - // `type_id::TIMESTAMP_NANOSECONDS` + // nanoseconds as output in case of `type_id::EMPTY` and + // `type_id::TIMESTAMP_NANOSECONDS` + return (d_s + d_ns).count(); } }(); From c5518ef2df603880ebcca712734aa2dbcd0b6631 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 16 Feb 2023 15:46:04 -0800 Subject: [PATCH 03/33] another one --- cpp/src/io/orc/timezone.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 55036badd64..866ddeacdce 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -52,7 +52,6 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * beyond the one covered by the TZif file * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used * for all times beyond the one covered by the TZif file - * @param count Number of elements in @p ttimes and @p offsets * @param ts ORC timestamp * * @return GMT offset @@ -64,7 +63,7 @@ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes if (ttimes.empty()) { return 0; } auto const ts_ttime_it = [&]() { - auto last_less_equal = [](auto begin, auto end, int64_t value) { + auto last_less_equal = [](auto begin, auto end, auto value) { auto const first_larger = thrust::upper_bound(thrust::seq, begin, end, value); // Return start of the range if all elements are larger than the value if (first_larger == begin) return begin; From 979ea3f7003d6cc37828fa97e8cd461a8cb27074 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 11:24:48 -0800 Subject: [PATCH 04/33] return table! --- cpp/src/io/orc/orc_gpu.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 11 +++++----- cpp/src/io/orc/reader_impl.hpp | 4 ++-- cpp/src/io/orc/stripe_data.cu | 9 ++++----- cpp/src/io/orc/timezone.cpp | 37 +++++++++++++++++++++++++--------- cpp/src/io/orc/timezone.cuh | 37 +++++++++------------------------- 6 files changed, 51 insertions(+), 49 deletions(-) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 43f0565845c..eceac1b5527 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -294,7 +294,7 @@ void DecodeOrcColumnData(ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 96eb20e1e66..2d3d65ae9cb 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -603,7 +603,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -915,11 +915,11 @@ reader::impl::impl(std::vector>&& sources, decimal128_columns = options.get_decimal128_columns(); } -timezone_table reader::impl::compute_timezone_table( +std::unique_ptr reader::impl::compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream) { - if (selected_stripes.empty()) return {}; + if (selected_stripes.empty()) return std::make_unique(); auto const has_timestamp_column = std::any_of( selected_columns.levels.cbegin(), selected_columns.levels.cend(), [&](auto& col_lvl) { @@ -927,7 +927,7 @@ timezone_table reader::impl::compute_timezone_table( return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; }); }); - if (not has_timestamp_column) return {}; + if (not has_timestamp_column) return std::make_unique(); return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone, stream); @@ -1238,10 +1238,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, } if (not is_level_data_empty) { + auto const tz_table_dview = table_device_view::create(tz_table->view(), stream); decode_stream_data(chunks, num_dict_entries, skip_rows, - tz_table.view(), + *tz_table_dview, row_groups, _metadata.get_row_index_stride(), out_buffers[level], diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 96492e4c2b2..4ed7b489b1c 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -145,7 +145,7 @@ class reader::impl { void decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -210,7 +210,7 @@ class reader::impl { * * @return Timezone table with timestamp offsets */ - timezone_table compute_timezone_table( + std::unique_ptr
compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index e7eb7d43dcd..4b322576aab 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1374,7 +1374,7 @@ template __global__ void __launch_bounds__(block_size) gpuDecodeOrcColumnData(ColumnDesc* chunks, DictionaryEntry* global_dictionary, - timezone_table_view tz_table, + table_device_view tz_table, device_2dspan row_groups, size_t first_row, uint32_t rowidx_stride, @@ -1446,8 +1446,7 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = - kORCTimeToUTC - get_gmt_offset(tz_table.ttimes, tz_table.offsets, kORCTimeToUTC); + s->top.data.utc_epoch = kORCTimeToUTC - get_gmt_offset(tz_table, kORCTimeToUTC); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1771,7 +1770,7 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); + seconds += get_gmt_offset(tz_table, seconds); int64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; @@ -1886,7 +1885,7 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index cbc7fe99d46..066825e55d5 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -15,7 +15,9 @@ */ #include "timezone.cuh" +#include #include +#include #include #include @@ -370,18 +372,18 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream) +std::unique_ptr
build_timezone_transition_table(std::string const& timezone_name, + rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC - return {}; + return std::make_unique(); } timezone_file const tzf(timezone_name); - std::vector ttimes(1); - std::vector offsets(1); + std::vector ttimes(1); + std::vector offsets(1); // One ancient rule entry, one per TZ file entry, 2 entries per year in the future cycle ttimes.reserve(1 + tzf.timecnt() + cycle_entry_cnt); offsets.reserve(1 + tzf.timecnt() + cycle_entry_cnt); @@ -404,7 +406,7 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, if (tzf.typecnt() == 0 || tzf.ttype[0].utcoff == 0) { // No transitions, offset is zero; Table would be a no-op. // Return an empty table to speed up parsing. - return {}; + return std::make_unique(); } // No transitions to use for the time/offset - use the first offset and apply to all timestamps ttimes[0] = std::numeric_limits::max(); @@ -458,9 +460,26 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - auto d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream); - return {std::move(d_ttimes), std::move(d_offsets)}; + std::vector> tz_table_columns; + tz_table_columns.emplace_back(make_timestamp_column( + data_type{type_id::TIMESTAMP_SECONDS}, ttimes.size(), mask_state::UNALLOCATED, stream)); + tz_table_columns.emplace_back(make_duration_column( + data_type{type_id::DURATION_SECONDS}, offsets.size(), mask_state::UNALLOCATED, stream)); + + CUDF_CUDA_TRY(cudaMemcpyAsync(tz_table_columns[0]->mutable_view().head(), + ttimes.data(), + ttimes.size() * sizeof(timestamp_s::rep), + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(tz_table_columns[1]->mutable_view().head(), + offsets.data(), + offsets.size() * sizeof(duration_s::rep), + cudaMemcpyDefault, + stream.value())); + // Need to finish copies before ttimes and offsets go out of scope + stream.synchronize(); + + return std::make_unique(std::move(tz_table_columns)); } } // namespace io diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 866ddeacdce..9af0d76e1db 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -32,11 +33,6 @@ namespace cudf { namespace io { -struct timezone_table_view { - cudf::device_span ttimes; - cudf::device_span offsets; -}; - // Cycle in which the time offsets repeat static constexpr int32_t cycle_years = 400; // Number of seconds in 400 years @@ -56,11 +52,14 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return GMT offset */ -inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes, - cudf::device_span offsets, - int64_t ts) +inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, int64_t ts) { - if (ttimes.empty()) { return 0; } + if (tz_table.num_rows() == 0) { return 0; } + + cudf::device_span ttimes(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); + cudf::device_span offsets(tz_table.column(1).head(), + static_cast(tz_table.num_rows())); auto const ts_ttime_it = [&]() { auto last_less_equal = [](auto begin, auto end, auto value) { @@ -85,22 +84,6 @@ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes return offsets[ts_ttime_it - ttimes.begin()]; } -class timezone_table { - rmm::device_uvector ttimes; - rmm::device_uvector offsets; - - public: - // Safe to use the default stream, device_uvectors will not change after they are created empty - timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} - { - } - timezone_table(rmm::device_uvector&& ttimes, rmm::device_uvector&& offsets) - : ttimes{std::move(ttimes)}, offsets{std::move(offsets)} - { - } - [[nodiscard]] timezone_table_view view() const { return {ttimes, offsets}; } -}; - /** * @brief Creates a transition table to convert ORC timestamps to UTC. * @@ -111,8 +94,8 @@ class timezone_table { * * @return The transition table for the given timezone */ -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream); +std::unique_ptr
build_timezone_transition_table(std::string const& timezone_name, + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf From 328db73afdec41fd92e24b800d6f9c55a75e7fa9 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 14:05:42 -0800 Subject: [PATCH 05/33] create tz table content properly --- cpp/src/io/orc/reader_impl.hpp | 2 +- cpp/src/io/orc/stripe_data.cu | 2 +- cpp/src/io/orc/timezone.cpp | 37 +++++++++++++++++++--------------- cpp/src/io/orc/timezone.cuh | 2 +- 4 files changed, 24 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4ed7b489b1c..94b0fdc09d2 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 4b322576aab..c0e3f8e820e 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 066825e55d5..7ac47f86de7 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -460,22 +460,27 @@ std::unique_ptr
build_timezone_transition_table(std::string const& timezo .count(); } + CUDF_EXPECTS(ttimes.size() == offsets.size(), + "Error reading TZif file for timezone " + timezone_name); + + std::vector ttimes_typed; + ttimes_typed.reserve(ttimes.size()); + std::transform(ttimes.cbegin(), ttimes.cend(), std::back_inserter(ttimes_typed), [](auto ts) { + return timestamp_s{duration_s{ts}}; + }); + std::vector offsets_typed; + offsets_typed.reserve(offsets.size()); + std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { + return duration_s{ts}; + }); + + auto d_ttimes = cudf::detail::make_device_uvector_async(ttimes_typed, stream); + auto d_offsets = cudf::detail::make_device_uvector_async(offsets_typed, stream); + std::vector> tz_table_columns; - tz_table_columns.emplace_back(make_timestamp_column( - data_type{type_id::TIMESTAMP_SECONDS}, ttimes.size(), mask_state::UNALLOCATED, stream)); - tz_table_columns.emplace_back(make_duration_column( - data_type{type_id::DURATION_SECONDS}, offsets.size(), mask_state::UNALLOCATED, stream)); - - CUDF_CUDA_TRY(cudaMemcpyAsync(tz_table_columns[0]->mutable_view().head(), - ttimes.data(), - ttimes.size() * sizeof(timestamp_s::rep), - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(tz_table_columns[1]->mutable_view().head(), - offsets.data(), - offsets.size() * sizeof(duration_s::rep), - cudaMemcpyDefault, - stream.value())); + tz_table_columns.emplace_back(std::make_unique(std::move(d_ttimes))); + tz_table_columns.emplace_back(std::make_unique(std::move(d_offsets))); + // Need to finish copies before ttimes and offsets go out of scope stream.synchronize(); diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 9af0d76e1db..da605c120ff 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 640abd6b807ee7dd1cbf01bd64dddcb0049ed0d2 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 15:30:36 -0800 Subject: [PATCH 06/33] take timestamp_s in get_gmt_offset --- cpp/src/io/orc/stripe_data.cu | 5 +++-- cpp/src/io/orc/timezone.cuh | 18 +++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c0e3f8e820e..b30766d9a8b 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1446,7 +1446,8 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = kORCTimeToUTC - get_gmt_offset(tz_table, kORCTimeToUTC); + s->top.data.utc_epoch = + kORCTimeToUTC - get_gmt_offset(tz_table, timestamp_s{duration_s{kORCTimeToUTC}}); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1770,7 +1771,7 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - seconds += get_gmt_offset(tz_table, seconds); + seconds += get_gmt_offset(tz_table, timestamp_s{duration_s{seconds}}); int64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index da605c120ff..8098217eee6 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -35,12 +35,16 @@ namespace io { // Cycle in which the time offsets repeat static constexpr int32_t cycle_years = 400; -// Number of seconds in 400 years -static constexpr int64_t cycle_seconds = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}).count(); // Two entries per year, over the length of the cycle static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; +inline __device__ auto project_to_cycle(timestamp_s ts) +{ + static constexpr duration_s cycle_s = + cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; +} + /** * @brief Returns the GMT offset for a given date and given timezone table. * @@ -52,12 +56,12 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return GMT offset */ -inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, int64_t ts) +inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, timestamp_s ts) { if (tz_table.num_rows() == 0) { return 0; } - cudf::device_span ttimes(tz_table.column(0).head(), - static_cast(tz_table.num_rows())); + cudf::device_span ttimes(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); cudf::device_span offsets(tz_table.column(1).head(), static_cast(tz_table.num_rows())); @@ -77,7 +81,7 @@ inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, int return last_less_equal(ttimes.begin(), file_entry_end, ts); } else { // Search the 400-year cycle if outside of the file entries range - return last_less_equal(file_entry_end, ttimes.end(), (ts + cycle_seconds) % cycle_seconds); + return last_less_equal(file_entry_end, ttimes.end(), project_to_cycle(ts)); } }(); From 6e3057813afc2d6dfe3e4772f3ccfbe692cb7be0 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 15:37:29 -0800 Subject: [PATCH 07/33] return duration --- cpp/src/io/orc/stripe_data.cu | 4 ++-- cpp/src/io/orc/timezone.cuh | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index b30766d9a8b..7f3f9cf4f8f 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1447,7 +1447,7 @@ __global__ void __launch_bounds__(block_size) if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } s->top.data.utc_epoch = - kORCTimeToUTC - get_gmt_offset(tz_table, timestamp_s{duration_s{kORCTimeToUTC}}); + kORCTimeToUTC - get_gmt_offset(tz_table, timestamp_s{duration_s{kORCTimeToUTC}}).count(); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1771,7 +1771,7 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - seconds += get_gmt_offset(tz_table, timestamp_s{duration_s{seconds}}); + seconds += get_gmt_offset(tz_table, timestamp_s{duration_s{seconds}}).count(); int64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 8098217eee6..5de9fff4734 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -56,14 +56,12 @@ inline __device__ auto project_to_cycle(timestamp_s ts) * * @return GMT offset */ -inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, timestamp_s ts) +inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestamp_s ts) { - if (tz_table.num_rows() == 0) { return 0; } + if (tz_table.num_rows() == 0) { return duration_s{0}; } cudf::device_span ttimes(tz_table.column(0).head(), static_cast(tz_table.num_rows())); - cudf::device_span offsets(tz_table.column(1).head(), - static_cast(tz_table.num_rows())); auto const ts_ttime_it = [&]() { auto last_less_equal = [](auto begin, auto end, auto value) { @@ -85,7 +83,7 @@ inline __device__ duration_s::rep get_gmt_offset(table_device_view tz_table, tim } }(); - return offsets[ts_ttime_it - ttimes.begin()]; + return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); } /** From 9ae2a26abaeabca2b250ed28ba1de5a558aab725 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 16:29:06 -0800 Subject: [PATCH 08/33] use chrono types in ORC --- cpp/src/io/orc/stripe_data.cu | 31 ++++++++++++++----------------- cpp/src/io/orc/stripe_enc.cu | 4 ++-- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 7f3f9cf4f8f..1f17b7a4d80 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -43,8 +43,8 @@ inline __device__ uint8_t is_rlev1(uint8_t encoding_mode) { return encoding_mode inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding_mode & 1; } -static __device__ __constant__ int64_t kORCTimeToUTC = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 +// Seconds from January 1st, 1970 to January 1st, 2015 +static __device__ __constant__ duration_s orc_utc_epoch = duration_s{1420070400}; struct orc_bytestream_s { const uint8_t* base; @@ -101,7 +101,7 @@ struct orc_datadec_state_s { uint32_t max_vals; // max # of non-zero values to decode in this batch uint32_t nrows; // # of rows in current batch (up to block_size) uint32_t buffered_count; // number of buffered values in the secondary data stream - int64_t utc_epoch; // kORCTimeToUTC - gmtOffset + duration_s tz_epoch; // orc_utc_epoch - gmtOffset RowGroup index; }; @@ -1446,8 +1446,7 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = - kORCTimeToUTC - get_gmt_offset(tz_table, timestamp_s{duration_s{kORCTimeToUTC}}).count(); + s->top.data.tz_epoch = orc_utc_epoch - get_gmt_offset(tz_table, timestamp_s{orc_utc_epoch}); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1770,35 +1769,33 @@ __global__ void __launch_bounds__(block_size) break; } case TIMESTAMP: { - int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - seconds += get_gmt_offset(tz_table, timestamp_s{duration_s{seconds}}).count(); + auto seconds = s->top.data.tz_epoch + duration_s{s->vals.i64[t + vals_skipped]}; + // Convert to UTC + seconds += get_gmt_offset(tz_table, timestamp_s{seconds}); - int64_t nanos = secondary_val; - nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; + duration_ns nanos = duration_ns{(static_cast(secondary_val) >> 3) * + kTimestampNanoScale[secondary_val & 7]}; // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. // Comparing with 999999 instead of zero to match the apache writer. - if (seconds < 0 and nanos > 999999) { seconds -= 1; } - - duration_ns d_ns{nanos}; - duration_s d_s{seconds}; + if (seconds.count() < 0 and nanos.count() > 999999) { seconds -= duration_s{1}; } static_cast(data_out)[row] = [&]() { using cuda::std::chrono::duration_cast; switch (s->chunk.timestamp_type_id) { case type_id::TIMESTAMP_SECONDS: - return (d_s + duration_cast(d_ns)).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MILLISECONDS: - return (d_s + duration_cast(d_ns)).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MICROSECONDS: - return (d_s + duration_cast(d_ns)).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_NANOSECONDS: default: // nanoseconds as output in case of `type_id::EMPTY` and // `type_id::TIMESTAMP_NANOSECONDS` - return (d_s + d_ns).count(); + return (seconds + nanos).count(); } }(); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 9032e3d2502..98b5b1d1db1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -49,7 +49,7 @@ constexpr int scratch_buffer_size = 512 * 4; // Workaround replaces zero-length patch lists by a dummy zero patch constexpr bool zero_pll_war = true; -static __device__ __constant__ int64_t kORCTimeToUTC = +static __device__ __constant__ int64_t orc_utc_epoch = 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 struct byterle_enc_state_s { @@ -814,7 +814,7 @@ __global__ void __launch_bounds__(block_size) int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); - s->vals.i64[nz_idx] = seconds - kORCTimeToUTC; + s->vals.i64[nz_idx] = seconds - orc_utc_epoch; if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits uint32_t zeroes = 0; From 76d0ae118cbbe4696b04fe0db354e5f773d52e55 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 16:39:13 -0800 Subject: [PATCH 09/33] add path param --- cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/timezone.cpp | 9 +++++---- cpp/src/io/orc/timezone.cuh | 4 +++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 2d3d65ae9cb..6ba9443eff3 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -929,8 +929,8 @@ std::unique_ptr
reader::impl::compute_timezone_table( }); if (not has_timestamp_column) return std::make_unique(); - return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone, - stream); + return build_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } table_with_metadata reader::impl::read(size_type skip_rows, diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 7ac47f86de7..9b098f973cd 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -126,12 +126,12 @@ struct timezone_file { "Number of transition times is larger than the file size."); } - timezone_file(std::string const& timezone_name) + timezone_file(std::optional const& tzif_dir, std::string const& timezone_name) { using std::ios_base; // Open the input file - std::string const tz_filename = tzif_system_directory + timezone_name; + auto const tz_filename = tzif_dir.value_or(tzif_system_directory) + timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); CUDF_EXPECTS(fin, "Failed to open the timezone file."); @@ -372,7 +372,8 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -std::unique_ptr
build_timezone_transition_table(std::string const& timezone_name, +std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, + std::string const& timezone_name, rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { @@ -380,7 +381,7 @@ std::unique_ptr
build_timezone_transition_table(std::string const& timezo return std::make_unique(); } - timezone_file const tzf(timezone_name); + timezone_file const tzf(tzif_dir, timezone_name); std::vector ttimes(1); std::vector offsets(1); diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 5de9fff4734..4630e80d472 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -91,12 +91,14 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam * * Uses system's TZif files. Assumes little-endian platform when parsing these files. * + * @param tzif_dir TODO * @param timezone_name standard timezone name (for example, "US/Pacific") * @param stream CUDA stream used for device memory operations and kernel launches * * @return The transition table for the given timezone */ -std::unique_ptr
build_timezone_transition_table(std::string const& timezone_name, +std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, + std::string const& timezone_name, rmm::cuda_stream_view stream); } // namespace io From 449648da476dac5e7ba9943e4be4cefdaeb0df14 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 17 Feb 2023 17:20:58 -0800 Subject: [PATCH 10/33] style --- cpp/src/io/orc/stripe_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 98b5b1d1db1..76ac1640d9b 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From f05cc01f1db368229b36188aaad72c5c7822270e Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:06:07 -0800 Subject: [PATCH 11/33] move declarations to /include --- .../orc => include/cudf/detail}/timezone.cuh | 38 ++------------ cpp/include/cudf/detail/timezone.hpp | 49 +++++++++++++++++++ cpp/src/io/orc/orc_gpu.hpp | 2 +- cpp/src/io/orc/reader_impl.cu | 4 +- cpp/src/io/orc/timezone.cpp | 8 ++- 5 files changed, 60 insertions(+), 41 deletions(-) rename cpp/{src/io/orc => include/cudf/detail}/timezone.cuh (68%) create mode 100644 cpp/include/cudf/detail/timezone.hpp diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh similarity index 68% rename from cpp/src/io/orc/timezone.cuh rename to cpp/include/cudf/detail/timezone.cuh index 4630e80d472..434302374d9 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,30 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once + #pragma once #include #include -#include -#include #include -#include +#include -#include #include #include -#include -#include -#include +namespace cudf::detail { -namespace cudf { -namespace io { - -// Cycle in which the time offsets repeat -static constexpr int32_t cycle_years = 400; -// Two entries per year, over the length of the cycle -static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; inline __device__ auto project_to_cycle(timestamp_s ts) { @@ -86,20 +74,4 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); } -/** - * @brief Creates a transition table to convert ORC timestamps to UTC. - * - * Uses system's TZif files. Assumes little-endian platform when parsing these files. - * - * @param tzif_dir TODO - * @param timezone_name standard timezone name (for example, "US/Pacific") - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The transition table for the given timezone - */ -std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, - std::string const& timezone_name, - rmm::cuda_stream_view stream); - -} // namespace io -} // namespace cudf +} diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp new file mode 100644 index 00000000000..4718cb50580 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once + +#include + +#include + +#include +#include +#include + +namespace cudf::detail { + +// Cycle in which the time offsets repeat +static constexpr int32_t cycle_years = 400; +// Number of future entires in the timezone transition table: +// Two entries per year, over the length of the cycle +static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; + +/** + * @brief Creates a transition table to convert ORC timestamps to UTC. + * + * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * + * @param tzif_dir The directory where the TZif files are located + * @param timezone_name standard timezone name (for example, "US/Pacific") + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The transition table for the given timezone + */ +std::unique_ptr
make_timezone_transition_table(std::optional const& tzif_dir, + std::string const& timezone_name, + rmm::cuda_stream_view stream); + +} diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index eceac1b5527..05560a3ca62 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -16,7 +16,7 @@ #pragma once -#include "timezone.cuh" +#include #include "orc.hpp" diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 6ba9443eff3..1df1ddddeb6 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -23,7 +23,6 @@ #include "orc_gpu.hpp" #include "reader_impl.hpp" -#include "timezone.cuh" #include #include @@ -32,6 +31,7 @@ #include #include +#include #include #include #include @@ -929,7 +929,7 @@ std::unique_ptr
reader::impl::compute_timezone_table( }); if (not has_timestamp_column) return std::make_unique(); - return build_timezone_transition_table( + return cudf::detail::make_timezone_transition_table( {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 9b098f973cd..ed3cbea356f 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "timezone.cuh" +#include #include #include @@ -22,8 +22,7 @@ #include #include -namespace cudf { -namespace io { +namespace cudf::detail { constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24); std::string const tzif_system_directory = "/usr/share/zoneinfo/"; @@ -372,7 +371,7 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -std::unique_ptr
build_timezone_transition_table(std::optional const& tzif_dir, +std::unique_ptr
make_timezone_transition_table(std::optional const& tzif_dir, std::string const& timezone_name, rmm::cuda_stream_view stream) { @@ -488,5 +487,4 @@ std::unique_ptr
build_timezone_transition_table(std::optional(std::move(tz_table_columns)); } -} // namespace io } // namespace cudf From 7c5e32c0e0e20bf9627389057054df51b7310f67 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:16:04 -0800 Subject: [PATCH 12/33] drop cont& --- cpp/include/cudf/detail/timezone.hpp | 2 +- cpp/src/io/orc/timezone.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 4718cb50580..d39a52c6886 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -42,7 +42,7 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return The transition table for the given timezone */ -std::unique_ptr
make_timezone_transition_table(std::optional const& tzif_dir, +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string const& timezone_name, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index ed3cbea356f..4094ef78ada 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -371,7 +371,7 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -std::unique_ptr
make_timezone_transition_table(std::optional const& tzif_dir, +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string const& timezone_name, rmm::cuda_stream_view stream) { From ebca7abb118802cfd18ec539db1a7b7def7ceb4b Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:29:21 -0800 Subject: [PATCH 13/33] more param clean up --- cpp/include/cudf/detail/timezone.hpp | 4 ++-- cpp/src/io/orc/timezone.cpp | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index d39a52c6886..a941f89a4f9 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -42,8 +42,8 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return The transition table for the given timezone */ -std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, - std::string const& timezone_name, +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, rmm::cuda_stream_view stream); } diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 4094ef78ada..b98f15c118a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf::detail { @@ -125,12 +126,12 @@ struct timezone_file { "Number of transition times is larger than the file size."); } - timezone_file(std::optional const& tzif_dir, std::string const& timezone_name) + timezone_file(std::optional const& tzif_dir, std::string_view timezone_name) { using std::ios_base; // Open the input file - auto const tz_filename = tzif_dir.value_or(tzif_system_directory) + timezone_name; + auto const tz_filename = std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); CUDF_EXPECTS(fin, "Failed to open the timezone file."); From dcb717919aa041879866b46115651e314f2556b3 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:38:18 -0800 Subject: [PATCH 14/33] lil fix --- cpp/src/io/orc/timezone.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index b98f15c118a..2e5a04c4140 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -372,8 +372,8 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, - std::string const& timezone_name, +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { @@ -462,7 +462,7 @@ std::unique_ptr
make_timezone_transition_table(std::optional } CUDF_EXPECTS(ttimes.size() == offsets.size(), - "Error reading TZif file for timezone " + timezone_name); + "Error reading TZif file for timezone " + std::string{timezone_name}); std::vector ttimes_typed; ttimes_typed.reserve(ttimes.size()); From e3a656aa30077a72ba36f5ba12ce22882a862546 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 13:48:35 -0800 Subject: [PATCH 15/33] style --- conda/recipes/libcudf/meta.yaml | 2 ++ cpp/include/cudf/detail/timezone.cuh | 7 +++---- cpp/include/cudf/detail/timezone.hpp | 12 ++++++------ cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/timezone.cpp | 13 +++++++------ 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index fbfcf6e71a2..cd89b751e1d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -120,6 +120,8 @@ outputs: - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp + - test -f $PREFIX/include/cudf/detail/timezone.cuh + - test -f $PREFIX/include/cudf/detail/timezone.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp - test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 434302374d9..6aef53a08e7 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -13,19 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once +#pragma once +#include #include #include #include -#include #include #include namespace cudf::detail { - inline __device__ auto project_to_cycle(timestamp_s ts) { static constexpr duration_s cycle_s = @@ -74,4 +73,4 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); } -} +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index a941f89a4f9..7ed678fce84 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -13,21 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once +#pragma once #include #include -#include #include #include +#include namespace cudf::detail { // Cycle in which the time offsets repeat static constexpr int32_t cycle_years = 400; -// Number of future entires in the timezone transition table: +// Number of future entries in the timezone transition table: // Two entries per year, over the length of the cycle static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; @@ -43,7 +43,7 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * @return The transition table for the given timezone */ std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, - std::string_view timezone_name, - rmm::cuda_stream_view stream); + std::string_view timezone_name, + rmm::cuda_stream_view stream); -} +} // namespace cudf::detail diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1df1ddddeb6..fd0bd5fef76 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,9 +29,9 @@ #include #include +#include #include #include -#include #include #include #include diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 2e5a04c4140..b208798bcb0 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -20,8 +20,8 @@ #include #include -#include #include +#include namespace cudf::detail { @@ -126,12 +126,13 @@ struct timezone_file { "Number of transition times is larger than the file size."); } - timezone_file(std::optional const& tzif_dir, std::string_view timezone_name) + timezone_file(std::optional tzif_dir, std::string_view timezone_name) { using std::ios_base; // Open the input file - auto const tz_filename = std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; + auto const tz_filename = + std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); CUDF_EXPECTS(fin, "Failed to open the timezone file."); @@ -373,8 +374,8 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) } std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, - std::string_view timezone_name, - rmm::cuda_stream_view stream) + std::string_view timezone_name, + rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC @@ -488,4 +489,4 @@ std::unique_ptr
make_timezone_transition_table(std::optional(std::move(tz_table_columns)); } -} // namespace cudf +} // namespace cudf::detail From b6ec20cf9f35e2a1ca40149c07674c6502f1fbb0 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 28 Feb 2023 15:42:54 -0800 Subject: [PATCH 16/33] lambda --- cpp/include/cudf/detail/timezone.cuh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 6aef53a08e7..f2530da6781 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -25,13 +25,6 @@ namespace cudf::detail { -inline __device__ auto project_to_cycle(timestamp_s ts) -{ - static constexpr duration_s cycle_s = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); - return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; -} - /** * @brief Returns the GMT offset for a given date and given timezone table. * @@ -65,6 +58,11 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam // Search the file entries if the timestamp is in range return last_less_equal(ttimes.begin(), file_entry_end, ts); } else { + auto project_to_cycle = [](timestamp_s ts) { + static constexpr duration_s cycle_s = + cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; + }; // Search the 400-year cycle if outside of the file entries range return last_less_equal(file_entry_end, ttimes.end(), project_to_cycle(ts)); } From 49ffb2c5084ae330623981e01c5a2b56db561abe Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 6 Mar 2023 15:11:03 -0800 Subject: [PATCH 17/33] move out of detail --- conda/recipes/libcudf/meta.yaml | 2 +- cpp/include/cudf/{detail => }/timezone.hpp | 4 ++-- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/timezone.cpp | 8 ++++++-- 4 files changed, 10 insertions(+), 6 deletions(-) rename cpp/include/cudf/{detail => }/timezone.hpp (96%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index cd89b751e1d..74351bc1bc2 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -82,6 +82,7 @@ outputs: - test -f $PREFIX/include/cudf/concatenate.hpp - test -f $PREFIX/include/cudf/copying.hpp - test -f $PREFIX/include/cudf/datetime.hpp + - test -f $PREFIX/include/cudf/timezone.hpp - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp - test -f $PREFIX/include/cudf/detail/binaryop.hpp @@ -121,7 +122,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/timezone.cuh - - test -f $PREFIX/include/cudf/detail/timezone.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp - test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/timezone.hpp similarity index 96% rename from cpp/include/cudf/detail/timezone.hpp rename to cpp/include/cudf/timezone.hpp index 7ed678fce84..a36f58f2ea5 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf::detail { +namespace cudf { // Cycle in which the time offsets repeat static constexpr int32_t cycle_years = 400; @@ -46,4 +46,4 @@ std::unique_ptr
make_timezone_transition_table(std::optional reader::impl::compute_timezone_table( }); if (not has_timestamp_column) return std::make_unique(); - return cudf::detail::make_timezone_transition_table( + return make_timezone_transition_table( {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index b208798bcb0..a5b6664b0f0 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -23,7 +23,9 @@ #include #include -namespace cudf::detail { +namespace cudf { + +namespace { constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24); std::string const tzif_system_directory = "/usr/share/zoneinfo/"; @@ -373,6 +375,8 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } +} // namespace + std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, rmm::cuda_stream_view stream) @@ -489,4 +493,4 @@ std::unique_ptr
make_timezone_transition_table(std::optional(std::move(tz_table_columns)); } -} // namespace cudf::detail +} // namespace cudf From b90dd7f3d4aaaed38a7645ce304dd28a8c1ced8e Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 6 Mar 2023 15:42:32 -0800 Subject: [PATCH 18/33] fix includes --- cpp/include/cudf/detail/timezone.cuh | 2 +- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/timezone.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index f2530da6781..06a6037a8c9 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 715c6858f06..8a1e6e2e2de 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index a5b6664b0f0..b844cb80b21 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include From e55b950e808f321f5463fe50f1914e27aa01340b Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 6 Mar 2023 15:52:04 -0800 Subject: [PATCH 19/33] style --- cpp/include/cudf/detail/timezone.cuh | 2 +- cpp/src/io/orc/reader_impl.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 06a6037a8c9..1bd96a79f7c 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include #include #include diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8a1e6e2e2de..00caeb18c74 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,10 +29,10 @@ #include #include -#include #include #include #include +#include #include #include #include From bb172e4dbeb0ecd2390b9ddd40a119a9b6c617da Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 15:36:54 -0700 Subject: [PATCH 20/33] improve naming --- cpp/include/cudf/detail/timezone.cuh | 12 ++++++------ cpp/include/cudf/timezone.hpp | 6 +++--- cpp/src/io/orc/timezone.cpp | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 1bd96a79f7c..c2a1003ce76 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -28,10 +28,10 @@ namespace cudf::detail { /** * @brief Returns the GMT offset for a given date and given timezone table. * - * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times + * @param ttimes Transition times; trailing `solar_cycle_entry_count` entries are used for all times * beyond the one covered by the TZif file - * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used - * for all times beyond the one covered by the TZif file + * @param offsets Time offsets in specific intervals; trailing `solar_cycle_entry_count` entries are + * used for all times beyond the one covered by the TZif file * @param ts ORC timestamp * * @return GMT offset @@ -52,15 +52,15 @@ inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestam return first_larger - 1; }; - auto const file_entry_end = ttimes.begin() + (ttimes.size() - cycle_entry_cnt); + auto const file_entry_end = ttimes.begin() + (ttimes.size() - solar_cycle_entry_count); if (ts <= *(file_entry_end - 1)) { // Search the file entries if the timestamp is in range return last_less_equal(ttimes.begin(), file_entry_end, ts); } else { auto project_to_cycle = [](timestamp_s ts) { - static constexpr duration_s cycle_s = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}); + static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( + duration_D{365 * solar_cycle_years + (100 - 3)}); return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; }; // Search the 400-year cycle if outside of the file entries range diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index a36f58f2ea5..5413abe4721 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -26,10 +26,10 @@ namespace cudf { // Cycle in which the time offsets repeat -static constexpr int32_t cycle_years = 400; +static constexpr int32_t solar_cycle_years = 400; // Number of future entries in the timezone transition table: -// Two entries per year, over the length of the cycle -static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; +// Two entries per year, over the length of the Gragorian calendar's solar cycle +static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; /** * @brief Creates a transition table to convert ORC timestamps to UTC. diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index b844cb80b21..e7a80141df9 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -391,8 +391,8 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes(1); std::vector offsets(1); // One ancient rule entry, one per TZ file entry, 2 entries per year in the future cycle - ttimes.reserve(1 + tzf.timecnt() + cycle_entry_cnt); - offsets.reserve(1 + tzf.timecnt() + cycle_entry_cnt); + ttimes.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); + offsets.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); size_t earliest_std_idx = 0; for (size_t t = 0; t < tzf.timecnt(); t++) { auto const ttime = tzf.transition_times[t]; @@ -445,7 +445,7 @@ std::unique_ptr
make_timezone_transition_table(std::optional Date: Mon, 13 Mar 2023 15:38:36 -0700 Subject: [PATCH 21/33] typo --- cpp/include/cudf/timezone.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 5413abe4721..5e895c42c01 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -28,7 +28,7 @@ namespace cudf { // Cycle in which the time offsets repeat static constexpr int32_t solar_cycle_years = 400; // Number of future entries in the timezone transition table: -// Two entries per year, over the length of the Gragorian calendar's solar cycle +// Two entries per year, over the length of the Gregorian calendar's solar cycle static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; /** From 2216ccd3362e42eb721243054749902f675853eb Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 15:39:10 -0700 Subject: [PATCH 22/33] moar clarifications! --- cpp/include/cudf/timezone.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 5e895c42c01..f6364e9da58 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -25,7 +25,7 @@ namespace cudf { -// Cycle in which the time offsets repeat +// Cycle in which the time offsets repeat in Gregorian calendar static constexpr int32_t solar_cycle_years = 400; // Number of future entries in the timezone transition table: // Two entries per year, over the length of the Gregorian calendar's solar cycle From 9b517b631d5ea803e1177c962ef08b4d0434061b Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 16:38:45 -0700 Subject: [PATCH 23/33] move constant to common header --- cpp/src/io/orc/orc.hpp | 2 ++ cpp/src/io/orc/stripe_data.cu | 6 ++---- cpp/src/io/orc/stripe_enc.cu | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 44882b71925..bf97d5b04d2 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -38,6 +38,8 @@ namespace io { namespace orc { static constexpr uint32_t block_header_size = 3; +// Seconds from January 1st, 1970 to January 1st, 2015 +static constexpr int64_t orc_utc_epoch = 1420070400; struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1f17b7a4d80..95fb6ce5eb5 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -43,9 +43,6 @@ inline __device__ uint8_t is_rlev1(uint8_t encoding_mode) { return encoding_mode inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding_mode & 1; } -// Seconds from January 1st, 1970 to January 1st, 2015 -static __device__ __constant__ duration_s orc_utc_epoch = duration_s{1420070400}; - struct orc_bytestream_s { const uint8_t* base; uint32_t pos; @@ -1446,7 +1443,8 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.tz_epoch = orc_utc_epoch - get_gmt_offset(tz_table, timestamp_s{orc_utc_epoch}); + static constexpr duration_s d_orc_utc_epoch = duration_s{orc_utc_epoch}; + s->top.data.tz_epoch = d_orc_utc_epoch - get_gmt_offset(tz_table, timestamp_s{d_orc_utc_epoch}); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 76ac1640d9b..427167e2d0f 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -49,9 +49,6 @@ constexpr int scratch_buffer_size = 512 * 4; // Workaround replaces zero-length patch lists by a dummy zero patch constexpr bool zero_pll_war = true; -static __device__ __constant__ int64_t orc_utc_epoch = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 - struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; From 675b9d76b2728c6642b28d2cd05bbd22a5aa8906 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 16:42:34 -0700 Subject: [PATCH 24/33] yaml sort --- conda/recipes/libcudf/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index aedf4280c5f..6806c97a6e1 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -123,9 +123,9 @@ outputs: - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp + - test -f $PREFIX/include/cudf/detail/timezone.cuh - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - - test -f $PREFIX/include/cudf/detail/timezone.cuh - test -f $PREFIX/include/cudf/detail/unary.hpp - test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp From 3cf32ee23004382af42ed6919b85407ebce9334a Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 16:52:29 -0700 Subject: [PATCH 25/33] use UT instead of GMT --- cpp/include/cudf/detail/timezone.cuh | 6 +++--- cpp/src/io/orc/orc.hpp | 2 +- cpp/src/io/orc/stripe_data.cu | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index c2a1003ce76..b30183a96c9 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -26,7 +26,7 @@ namespace cudf::detail { /** - * @brief Returns the GMT offset for a given date and given timezone table. + * @brief Returns the UT offset for a given date and given timezone table. * * @param ttimes Transition times; trailing `solar_cycle_entry_count` entries are used for all times * beyond the one covered by the TZif file @@ -34,9 +34,9 @@ namespace cudf::detail { * used for all times beyond the one covered by the TZif file * @param ts ORC timestamp * - * @return GMT offset + * @return offset from UT, in seconds */ -inline __device__ duration_s get_gmt_offset(table_device_view tz_table, timestamp_s ts) +inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp_s ts) { if (tz_table.num_rows() == 0) { return duration_s{0}; } diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index bf97d5b04d2..b97d3f57656 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 95fb6ce5eb5..ade024d198e 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -98,7 +98,7 @@ struct orc_datadec_state_s { uint32_t max_vals; // max # of non-zero values to decode in this batch uint32_t nrows; // # of rows in current batch (up to block_size) uint32_t buffered_count; // number of buffered values in the secondary data stream - duration_s tz_epoch; // orc_utc_epoch - gmtOffset + duration_s tz_epoch; // orc_ut_epoch - ut_offset RowGroup index; }; @@ -1444,7 +1444,7 @@ __global__ void __launch_bounds__(block_size) if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } static constexpr duration_s d_orc_utc_epoch = duration_s{orc_utc_epoch}; - s->top.data.tz_epoch = d_orc_utc_epoch - get_gmt_offset(tz_table, timestamp_s{d_orc_utc_epoch}); + s->top.data.tz_epoch = d_orc_utc_epoch - get_ut_offset(tz_table, timestamp_s{d_orc_ut_epoch}); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1769,7 +1769,7 @@ __global__ void __launch_bounds__(block_size) case TIMESTAMP: { auto seconds = s->top.data.tz_epoch + duration_s{s->vals.i64[t + vals_skipped]}; // Convert to UTC - seconds += get_gmt_offset(tz_table, timestamp_s{seconds}); + seconds += get_ut_offset(tz_table, timestamp_s{seconds}); duration_ns nanos = duration_ns{(static_cast(secondary_val) >> 3) * kTimestampNanoScale[secondary_val & 7]}; From 3b527457ce586da0facc22f36bece7ff561f1ada Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 16:53:48 -0700 Subject: [PATCH 26/33] fix --- cpp/src/io/orc/stripe_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index ade024d198e..8e698dd9dff 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1444,7 +1444,7 @@ __global__ void __launch_bounds__(block_size) if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } static constexpr duration_s d_orc_utc_epoch = duration_s{orc_utc_epoch}; - s->top.data.tz_epoch = d_orc_utc_epoch - get_ut_offset(tz_table, timestamp_s{d_orc_ut_epoch}); + s->top.data.tz_epoch = d_orc_utc_epoch - get_ut_offset(tz_table, timestamp_s{d_orc_utc_epoch}); bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); From d326b694f391986d0e91109b43ddfdf4d8bf7863 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 17:16:24 -0700 Subject: [PATCH 27/33] naming --- cpp/include/cudf/detail/timezone.cuh | 17 ++++++------ cpp/src/io/orc/timezone.cpp | 39 +++++++++++++++------------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index b30183a96c9..7be8afa1368 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -28,8 +28,8 @@ namespace cudf::detail { /** * @brief Returns the UT offset for a given date and given timezone table. * - * @param ttimes Transition times; trailing `solar_cycle_entry_count` entries are used for all times - * beyond the one covered by the TZif file + * @param transition_times Transition times; trailing `solar_cycle_entry_count` entries are used for + * all times beyond the one covered by the TZif file * @param offsets Time offsets in specific intervals; trailing `solar_cycle_entry_count` entries are * used for all times beyond the one covered by the TZif file * @param ts ORC timestamp @@ -40,8 +40,8 @@ inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp { if (tz_table.num_rows() == 0) { return duration_s{0}; } - cudf::device_span ttimes(tz_table.column(0).head(), - static_cast(tz_table.num_rows())); + cudf::device_span transition_times(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); auto const ts_ttime_it = [&]() { auto last_less_equal = [](auto begin, auto end, auto value) { @@ -52,11 +52,12 @@ inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp return first_larger - 1; }; - auto const file_entry_end = ttimes.begin() + (ttimes.size() - solar_cycle_entry_count); + auto const file_entry_end = + transition_times.begin() + (transition_times.size() - solar_cycle_entry_count); if (ts <= *(file_entry_end - 1)) { // Search the file entries if the timestamp is in range - return last_less_equal(ttimes.begin(), file_entry_end, ts); + return last_less_equal(transition_times.begin(), file_entry_end, ts); } else { auto project_to_cycle = [](timestamp_s ts) { static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( @@ -64,11 +65,11 @@ inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; }; // Search the 400-year cycle if outside of the file entries range - return last_less_equal(file_entry_end, ttimes.end(), project_to_cycle(ts)); + return last_less_equal(file_entry_end, transition_times.end(), project_to_cycle(ts)); } }(); - return tz_table.column(1).element(ts_ttime_it - ttimes.begin()); + return tz_table.column(1).element(ts_ttime_it - transition_times.begin()); } } // namespace cudf::detail diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index e7a80141df9..b7eb88a49d3 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -388,10 +388,10 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes(1); + std::vector transition_times(1); std::vector offsets(1); // One ancient rule entry, one per TZ file entry, 2 entries per year in the future cycle - ttimes.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); + transition_times.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); offsets.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); size_t earliest_std_idx = 0; for (size_t t = 0; t < tzf.timecnt(); t++) { @@ -399,15 +399,17 @@ std::unique_ptr
make_timezone_transition_table(std::optional make_timezone_transition_table(std::optional(); } // No transitions to use for the time/offset - use the first offset and apply to all timestamps - ttimes[0] = std::numeric_limits::max(); - offsets[0] = tzf.ttype[0].utcoff; + transition_times[0] = std::numeric_limits::max(); + offsets[0] = tzf.ttype[0].utcoff; } // Generate entries for times after the last transition @@ -450,14 +452,14 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes.rbegin()[0]) { - std::swap(ttimes.rbegin()[0], ttimes.rbegin()[1]); + if (transition_times.rbegin()[1] > transition_times.rbegin()[0]) { + std::swap(transition_times.rbegin()[0], transition_times.rbegin()[1]); std::swap(offsets.rbegin()[0], offsets.rbegin()[1]); } @@ -466,14 +468,15 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes_typed; - ttimes_typed.reserve(ttimes.size()); - std::transform(ttimes.cbegin(), ttimes.cend(), std::back_inserter(ttimes_typed), [](auto ts) { - return timestamp_s{duration_s{ts}}; - }); + ttimes_typed.reserve(transition_times.size()); + std::transform(transition_times.cbegin(), + transition_times.cend(), + std::back_inserter(ttimes_typed), + [](auto ts) { return timestamp_s{duration_s{ts}}; }); std::vector offsets_typed; offsets_typed.reserve(offsets.size()); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { @@ -487,7 +490,7 @@ std::unique_ptr
make_timezone_transition_table(std::optional(std::move(d_ttimes))); tz_table_columns.emplace_back(std::make_unique(std::move(d_offsets))); - // Need to finish copies before ttimes and offsets go out of scope + // Need to finish copies before transition_times and offsets go out of scope stream.synchronize(); return std::make_unique(std::move(tz_table_columns)); From 3e28bdf099788b20d61713f8b34b0d7bae0cea5b Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 17:17:02 -0700 Subject: [PATCH 28/33] change example --- cpp/include/cudf/timezone.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index f6364e9da58..3679ef89170 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -37,7 +37,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * Uses system's TZif files. Assumes little-endian platform when parsing these files. * * @param tzif_dir The directory where the TZif files are located - * @param timezone_name standard timezone name (for example, "US/Pacific") + * @param timezone_name standard timezone name (for example, "America/Los_Angeles") * @param stream CUDA stream used for device memory operations and kernel launches * * @return The transition table for the given timezone From 9050605412429ddab88969d60e5aea5583b72db6 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 17:28:48 -0700 Subject: [PATCH 29/33] clarify obscure calc --- cpp/include/cudf/detail/timezone.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh index 7be8afa1368..830ee1a7fa6 100644 --- a/cpp/include/cudf/detail/timezone.cuh +++ b/cpp/include/cudf/detail/timezone.cuh @@ -60,8 +60,12 @@ inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp return last_less_equal(transition_times.begin(), file_entry_end, ts); } else { auto project_to_cycle = [](timestamp_s ts) { + // Years divisible by four are leap years + // Exceptions are years divisible by 100, but not divisible by 400 + static constexpr int32_t num_leap_years_in_cycle = + solar_cycle_years / 4 - (solar_cycle_years / 100 - solar_cycle_years / 400); static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( - duration_D{365 * solar_cycle_years + (100 - 3)}); + duration_D{365 * solar_cycle_years + num_leap_years_in_cycle}); return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; }; // Search the 400-year cycle if outside of the file entries range From af02c84e5f9b06f12b8b909e0c9245cbe987b457 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 13 Mar 2023 18:28:19 -0700 Subject: [PATCH 30/33] separate detail API --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/detail/timezone.hpp | 36 ++++++++++++++++++++++++++++ cpp/include/cudf/timezone.hpp | 14 +++++------ cpp/src/io/orc/reader_impl.cu | 4 ++-- cpp/src/io/orc/timezone.cpp | 18 ++++++++++++-- 5 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 cpp/include/cudf/detail/timezone.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6806c97a6e1..55803baedc4 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -124,6 +124,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/timezone.cuh + - test -f $PREFIX/include/cudf/detail/timezone.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp new file mode 100644 index 00000000000..f7f97c0a7c2 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf::detail { + +/** + * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::detail diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 3679ef89170..09744dcb06b 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,15 +15,14 @@ */ #pragma once -#include - -#include +#include #include #include #include namespace cudf { +class table; // Cycle in which the time offsets repeat in Gregorian calendar static constexpr int32_t solar_cycle_years = 400; @@ -38,12 +37,13 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") - * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory. * * @return The transition table for the given timezone */ -std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, - std::string_view timezone_name, - rmm::cuda_stream_view stream); +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 00caeb18c74..fd0bd5fef76 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,10 +29,10 @@ #include #include +#include #include #include #include -#include #include #include #include @@ -929,7 +929,7 @@ std::unique_ptr
reader::impl::compute_timezone_table( }); if (not has_timestamp_column) return std::make_unique(); - return make_timezone_transition_table( + return cudf::detail::make_timezone_transition_table( {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index b7eb88a49d3..eec24993994 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -13,9 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include +#include #include #include @@ -379,7 +380,19 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream) + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::make_timezone_transition_table( + tzif_dir, timezone_name, cudf::get_default_stream(), mr); +} + +namespace detail { + +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC @@ -496,4 +509,5 @@ std::unique_ptr
make_timezone_transition_table(std::optional(std::move(tz_table_columns)); } +} // namespace detail } // namespace cudf From bd9943f998ce0e74acd274e9db37f5a6a29d50f8 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 21 Mar 2023 13:13:30 -0700 Subject: [PATCH 31/33] move timezone.cpp out of ORC --- cpp/CMakeLists.txt | 2 +- cpp/src/{io/orc => datetime}/timezone.cpp | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/{io/orc => datetime}/timezone.cpp (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0848af2a916..13583378134 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,7 @@ add_library( src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu src/io/orc/stripe_init.cu - src/io/orc/timezone.cpp + src/datetime/timezone.cpp src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/datetime/timezone.cpp similarity index 100% rename from cpp/src/io/orc/timezone.cpp rename to cpp/src/datetime/timezone.cpp From 69dd660ff6a68eaa00c31c314886c6d522284bd0 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 21 Mar 2023 13:37:54 -0700 Subject: [PATCH 32/33] make_timezone_transition_table comment --- cpp/include/cudf/timezone.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 09744dcb06b..75a3b0e27e9 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -34,6 +34,11 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * @brief Creates a transition table to convert ORC timestamps to UTC. * * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * The transition table starts with the entries from the TZif file. For timestamps after the file's + * last transition, the table includes entires that form a `solar_cycle_years`-year cycle (future + * entries). This portion of the table has `solar_cycle_entry_count` elements, as it assumes two + * transitions per year from Daylight Saving Time. If the timezone does not have DST, the table will + * still include the future entries, which will all have the same offset. * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") From a5709e10153ba91e560528ce2062516b9d06ac3c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 21 Mar 2023 13:50:05 -0700 Subject: [PATCH 33/33] type fix Co-authored-by: Bradley Dice --- cpp/include/cudf/timezone.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 75a3b0e27e9..56678c73811 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -35,7 +35,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * Uses system's TZif files. Assumes little-endian platform when parsing these files. * The transition table starts with the entries from the TZif file. For timestamps after the file's - * last transition, the table includes entires that form a `solar_cycle_years`-year cycle (future + * last transition, the table includes entries that form a `solar_cycle_years`-year cycle (future * entries). This portion of the table has `solar_cycle_entry_count` elements, as it assumes two * transitions per year from Daylight Saving Time. If the timezone does not have DST, the table will * still include the future entries, which will all have the same offset.