From b15c7623e2f0eb2b5a32c7a8d4ad561b84308761 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Sun, 5 Nov 2023 20:01:02 +0800 Subject: [PATCH 01/35] Add timestamp parser Signed-off-by: Chong Gao --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/datetime_parser.cu | 453 +++++++++++++++++++++++++++ src/main/cpp/src/datetime_parser.hpp | 67 ++++ 3 files changed, 521 insertions(+) create mode 100644 src/main/cpp/src/datetime_parser.cu create mode 100644 src/main/cpp/src/datetime_parser.hpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 1ad65687e2..4eabade61b 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -165,6 +165,7 @@ add_library( src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu + src/datetime_parser.cu src/datetime_rebase.cu src/decimal_utils.cu src/histogram.cu diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu new file mode 100644 index 0000000000..75a21493dd --- /dev/null +++ b/src/main/cpp/src/datetime_parser.cu @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datetime_parser.hpp" + +namespace { + +using timestamp_components = spark_rapids_jni::timestamp_components; + +/** + * Get the timestamp from epoch from a local date time in a specific time zone. + * Note: local date time may be overlap or gap, refer to `ZonedDateTime.of` + * + */ +__device__ cudf::timestamp_us +create_timestamp_from_components_and_zone(timestamp_components local_timestamp_components, + cudf::string_view time_zone) { + // TODO: implements: + // val localDateTime = LocalDateTime.of(localDate, localTime) + // val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) + // val instant = Instant.from(zonedDateTime) // main work + // instantToMicros(instant) + // here just return a zero + return cudf::timestamp_us{cudf::duration_us{0L}}; +} + +__device__ __host__ inline bool is_digit(const char chr) { + return (chr >= '0' && chr <= '9'); +} + +__device__ __host__ inline bool is_whitespace(const char chr) { + switch (chr) { + case ' ': + case '\r': + case '\t': + case '\n': return true; + default: return false; + } +} + +/** + * first trim the time zone, + * then format (+|-)h:mm, (+|-)hh:m or (+|-)h:m to (+|-)hh:mm + * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L39 + */ +__device__ __host__ cudf::string_view format_zone_id(const cudf::string_view &time_zone_id) { + const char *curr_ptr = time_zone_id.data(); + const char *end_ptr = curr_ptr + time_zone_id.size_bytes(); + + // trim left + int num_of_left_white_space = 0; + while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { + ++curr_ptr; + 
++num_of_left_white_space; + } + // trim right + while (curr_ptr < (end_ptr - 1) && is_whitespace(*(end_ptr - 1))) { + --end_ptr; + } + + const int length_after_trim = end_ptr - curr_ptr; + int state = 0; + char ret[] = "+00:00"; // save the formatted result + bool is_valid_form = true; // is one form of: (+|-)h:mm$, (+|-)hh:m$, (+|-)h:m$, (+|-)hh:mm$ + int curr_digit_num = 0; + while (curr_ptr <= end_ptr && is_valid_form) { + char chr = *curr_ptr; + if (0 == state) { // expect '+' or '-' + if (curr_ptr == end_ptr || !('+' == chr || '-' == chr)) { // get $ + is_valid_form = false; + } else { // get '+' or '-' + ret[0] = chr; + state = 1; + } + } else if (1 == state) { // exepct hour digits then ':' + if (curr_ptr == end_ptr) { // get $ + is_valid_form = false; + } else if (is_digit(chr) && curr_digit_num < 2) { // get digit + ++curr_digit_num; + // set hh part + ret[1] = ret[2]; + ret[2] = chr; + } else if (':' == chr && curr_digit_num > 0) { // get ':' + curr_digit_num = 0; + state = 2; + } else { + is_valid_form = false; + } + } else if (2 == state) { // expect minute digits then '$' + if (curr_ptr == end_ptr && curr_digit_num > 0) { // get $ + state = 3; // success state + } else if (is_digit(chr) && curr_digit_num < 2) { // get digit + ++curr_digit_num; + // set mm part + ret[4] = ret[5]; + ret[5] = chr; + } else { + is_valid_form = false; + } + } + ++curr_ptr; + } + + if (3 == state) { + // success + return cudf::string_view(ret, 6); + } else { + // failed to format, just trim time zone id + return cudf::string_view(time_zone_id.data() + num_of_left_white_space, length_after_trim); + } +} + +__device__ __host__ bool is_valid_digits(int segment, int digits) { + // A Long is able to represent a timestamp within [+-]200 thousand years + const int constexpr maxDigitsYear = 6; + // For the nanosecond part, more than 6 digits is allowed, but will be truncated. 
+ return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || + // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); +} + +/** + * + * Try to parse timestamp string and get a tuple which contains: + * - timestamp_components in timestamp string: (year, month, day, hour, minute, seconds, + * microseconds). If timestamp string does not contain date and only contains time, then + * (year,month,day) is a invalid value (-1, -1, -1). If timestamp string is invalid, then all the + * components is -1. + * - time zone in timestamp string, use default time zone if it's empty + * + * Note: the returned time zone is not validated + * + * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + */ +__device__ __host__ thrust::pair +parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, + cudf::string_view default_time_zone) { + auto error_compoments = timestamp_components{-1, -1, -1, -1, -1, -1, -1}; + auto error_time_zone = cudf::string_view(); + + if (timestamp_str.empty()) { + return thrust::make_pair(error_compoments, error_time_zone); + } + + const char *curr_ptr = timestamp_str.data(); + const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); + + // trim left + while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { + ++curr_ptr; + } + // trim right + while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) { + --end_ptr; + } + + if (curr_ptr == end_ptr) { + return thrust::make_pair(error_compoments, error_time_zone); + } + + const char *const bytes = curr_ptr; + const size_t bytes_length = end_ptr - curr_ptr; + + thrust::optional tz; + int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; + int segments_len = 9; + int i = 0; + int current_segment_value = 0; + int current_segment_digits = 
0; + size_t j = 0; + int digits_milli = 0; + bool just_time = false; + thrust::optional year_sign; + if ('-' == bytes[j] || '+' == bytes[j]) { + if ('-' == bytes[j]) { + year_sign = -1; + } else { + year_sign = 1; + } + j += 1; + } + + while (j < bytes_length) { + char b = bytes[j]; + int parsed_value = static_cast(b - '0'); + if (parsed_value < 0 || parsed_value > 9) { + if (0 == j && 'T' == b) { + just_time = true; + i += 3; + } else if (i < 2) { + if (b == '-') { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else if (0 == i && ':' == b && !year_sign.has_value()) { + just_time = true; + if (!is_valid_digits(3, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[3] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i = 4; + } else { + return thrust::make_pair(error_compoments, error_time_zone); + } + } else if (2 == i) { + if (' ' == b || 'T' == b) { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return thrust::make_pair(error_compoments, error_time_zone); + } + } else if (3 == i || 4 == i) { + if (':' == b) { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return thrust::make_pair(error_compoments, error_time_zone); + } + } else if (5 == i || 6 == i) { + if ('.' 
== b && 5 == i) { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + tz = cudf::string_view(bytes + j, (bytes_length - j)); + j = bytes_length - 1; + } + if (i == 6 && '.' != b) { + i += 1; + } + } else { + if (i < segments_len && (':' == b || ' ' == b)) { + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return thrust::make_pair(error_compoments, error_time_zone); + } + } + } else { + if (6 == i) { + digits_milli += 1; + } + // We will truncate the nanosecond part if there are more than 6 digits, which results + // in loss of precision + if (6 != i || current_segment_digits < 6) { + current_segment_value = current_segment_value * 10 + parsed_value; + } + current_segment_digits += 1; + } + j += 1; + } + + if (!is_valid_digits(i, current_segment_digits)) { + return thrust::make_pair(error_compoments, error_time_zone); + } + segments[i] = current_segment_value; + + while (digits_milli < 6) { + segments[6] *= 10; + digits_milli += 1; + } + + cudf::string_view timze_zone; + if (tz.has_value()) { + timze_zone = format_zone_id(tz.value()); + } else { + timze_zone = default_time_zone; + } + + segments[0] *= year_sign.value_or(1); + // above is translated from Spark. 
+ + // set components + auto components = timestamp_components{segments[0], + static_cast(segments[1]), + static_cast(segments[2]), + static_cast(segments[3]), + static_cast(segments[4]), + static_cast(segments[5]), + segments[6]}; + if (just_time) { + components.year = components.month = components.day = -1; + } + return thrust::make_pair(components, timze_zone); +} + +struct parse_timestamp_string_fn { + cudf::column_device_view const d_strings; + cudf::string_view default_time_zone; + + __device__ cudf::timestamp_us operator()(const cudf::size_type &idx) const { + auto const d_str = d_strings.element(idx); + auto components_tz = parse_string_to_timestamp_components_tz(d_str, default_time_zone); + return create_timestamp_from_components_and_zone(components_tz.first, components_tz.second); + } +}; + +/** + * + * Trims and parses timestamp string column to a timestamp column and a time zone + * column + * + */ +std::unique_ptr parse_string_to_timestamp_and_time_zone( + cudf::strings_column_view const &input, cudf::string_view default_time_zone, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto output_timestamp = cudf::make_timestamp_column( + cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); + + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + output_timestamp->mutable_view().begin(), + parse_timestamp_string_fn{*d_strings, default_time_zone}); + + return output_timestamp; +} + +} // namespace + +namespace spark_rapids_jni { + +/** + * + * Trims and parses timestamp string column to a timestamp components column and a time zone + * column, then create timestamp column + * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * 
org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + * + * @param input input string column view. + * @param default_time_zone if input string does not contain a time zone, use this time zone. + * @returns timestamp components column and time zone string. + * be empty. + */ +std::unique_ptr parse_string_to_timestamp(cudf::strings_column_view const &input, + cudf::string_view default_time_zone) { + auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}; + if (input.size() == 0) { + return cudf::make_empty_column(timestamp_type.id()); + } + + auto const stream = cudf::get_default_stream(); + auto const mr = rmm::mr::get_current_device_resource(); + return parse_string_to_timestamp_and_time_zone(input, default_time_zone, stream, mr); +} + +/** + * + * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone` + */ +std::unique_ptr +string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, + bool allow_time_zone) { + // TODO + throw std::runtime_error("Not implemented!!!"); +} + +/** + * + * Refer to `SparkDateTimeUtils.stringToTimestamp` + */ +std::unique_ptr string_to_timestamp(cudf::strings_column_view const &input, + cudf::string_view time_zone) { + // TODO + throw std::runtime_error("Not implemented!!!"); +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp new file mode 100644 index 0000000000..509fcf8008 --- /dev/null +++ b/src/main/cpp/src/datetime_parser.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace spark_rapids_jni { + +/** + * represents local date time in a time zone. + */ +struct timestamp_components { + int32_t year; // max 6 digits + int8_t month; + int8_t day; + int8_t hour; + int8_t minute; + int8_t second; + int32_t microseconds; +}; + +thrust::pair +parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, + cudf::string_view default_time_zone); + +/** + * + * Trims and parses timestamp string column to a timestamp components column and a time zone + * column, then create timestamp column + * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + * + * @param input input string column view. + * @param default_time_zone if input string does not contain a time zone, use this time zone. + * @returns timestamp components column and time zone string. + * be empty. 
+ */ +std::unique_ptr parse_string_to_timestamp(cudf::strings_column_view const &input, + cudf::string_view default_time_zone); + +/** + * + * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone` + */ +std::unique_ptr +string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, bool allow_time_zone); + +/** + * + * Refer to `SparkDateTimeUtils.stringToTimestamp` + */ +std::unique_ptr string_to_timestamp(cudf::strings_column_view const &input, + cudf::string_view time_zone); + +} // namespace spark_rapids_jni From 73e0f7e86f807d0763d1198d48c235161222327d Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 12 Dec 2023 10:37:26 +0800 Subject: [PATCH 02/35] Refine parser --- src/main/cpp/src/datetime_parser.cu | 269 ++++++++------------------- src/main/cpp/src/datetime_parser.hpp | 81 ++++---- 2 files changed, 125 insertions(+), 225 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 75a21493dd..14ed0a9c9f 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -32,48 +32,46 @@ #include #include #include +#include #include #include -#include -#include #include #include -#include #include #include -#include -#include #include -#include #include -#include #include +#include #include "datetime_parser.hpp" namespace { -using timestamp_components = spark_rapids_jni::timestamp_components; - /** - * Get the timestamp from epoch from a local date time in a specific time zone. - * Note: local date time may be overlap or gap, refer to `ZonedDateTime.of` - * + * represents local date time in a time zone. 
*/ -__device__ cudf::timestamp_us -create_timestamp_from_components_and_zone(timestamp_components local_timestamp_components, - cudf::string_view time_zone) { - // TODO: implements: - // val localDateTime = LocalDateTime.of(localDate, localTime) - // val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) - // val instant = Instant.from(zonedDateTime) // main work - // instantToMicros(instant) - // here just return a zero - return cudf::timestamp_us{cudf::duration_us{0L}}; -} +struct timestamp_components { + int32_t year; // max 6 digits + int8_t month; + int8_t day; + int8_t hour; + int8_t minute; + int8_t second; + int32_t microseconds; +}; -__device__ __host__ inline bool is_digit(const char chr) { - return (chr >= '0' && chr <= '9'); +/** + * convert a local time in a time zone to UTC timestamp + */ +__device__ __host__ thrust::tuple +to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) { + // TODO replace the temp implementation + long v = 365L * 86400L * 1000000L * components.year + 30L * 86400L * 1000000L * components.month + + 86400L * 1000000L * components.day + 3600L * 1000000L * components.hour + + 60L * 1000000L * components.minute + 1000000L * components.second + + components.microseconds; + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true); } __device__ __host__ inline bool is_whitespace(const char chr) { @@ -86,79 +84,6 @@ __device__ __host__ inline bool is_whitespace(const char chr) { } } -/** - * first trim the time zone, - * then format (+|-)h:mm, (+|-)hh:m or (+|-)h:m to (+|-)hh:mm - * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ - * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L39 - */ -__device__ __host__ cudf::string_view format_zone_id(const cudf::string_view &time_zone_id) { - const char *curr_ptr = time_zone_id.data(); - const char *end_ptr = curr_ptr + time_zone_id.size_bytes(); - - // trim left - int num_of_left_white_space = 0; - 
while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { - ++curr_ptr; - ++num_of_left_white_space; - } - // trim right - while (curr_ptr < (end_ptr - 1) && is_whitespace(*(end_ptr - 1))) { - --end_ptr; - } - - const int length_after_trim = end_ptr - curr_ptr; - int state = 0; - char ret[] = "+00:00"; // save the formatted result - bool is_valid_form = true; // is one form of: (+|-)h:mm$, (+|-)hh:m$, (+|-)h:m$, (+|-)hh:mm$ - int curr_digit_num = 0; - while (curr_ptr <= end_ptr && is_valid_form) { - char chr = *curr_ptr; - if (0 == state) { // expect '+' or '-' - if (curr_ptr == end_ptr || !('+' == chr || '-' == chr)) { // get $ - is_valid_form = false; - } else { // get '+' or '-' - ret[0] = chr; - state = 1; - } - } else if (1 == state) { // exepct hour digits then ':' - if (curr_ptr == end_ptr) { // get $ - is_valid_form = false; - } else if (is_digit(chr) && curr_digit_num < 2) { // get digit - ++curr_digit_num; - // set hh part - ret[1] = ret[2]; - ret[2] = chr; - } else if (':' == chr && curr_digit_num > 0) { // get ':' - curr_digit_num = 0; - state = 2; - } else { - is_valid_form = false; - } - } else if (2 == state) { // expect minute digits then '$' - if (curr_ptr == end_ptr && curr_digit_num > 0) { // get $ - state = 3; // success state - } else if (is_digit(chr) && curr_digit_num < 2) { // get digit - ++curr_digit_num; - // set mm part - ret[4] = ret[5]; - ret[5] = chr; - } else { - is_valid_form = false; - } - } - ++curr_ptr; - } - - if (3 == state) { - // success - return cudf::string_view(ret, 6); - } else { - // failed to format, just trim time zone id - return cudf::string_view(time_zone_id.data() + num_of_left_white_space, length_after_trim); - } -} - __device__ __host__ bool is_valid_digits(int segment, int digits) { // A Long is able to represent a timestamp within [+-]200 thousand years const int constexpr maxDigitsYear = 6; @@ -169,28 +94,14 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) { (segment != 0 && segment != 6 && 
segment != 7 && digits > 0 && digits <= 2); } -/** - * - * Try to parse timestamp string and get a tuple which contains: - * - timestamp_components in timestamp string: (year, month, day, hour, minute, seconds, - * microseconds). If timestamp string does not contain date and only contains time, then - * (year,month,day) is a invalid value (-1, -1, -1). If timestamp string is invalid, then all the - * components is -1. - * - time zone in timestamp string, use default time zone if it's empty - * - * Note: the returned time zone is not validated - * - * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ - * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 - */ -__device__ __host__ thrust::pair -parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, - cudf::string_view default_time_zone) { - auto error_compoments = timestamp_components{-1, -1, -1, -1, -1, -1, -1}; - auto error_time_zone = cudf::string_view(); +__device__ __host__ thrust::tuple +parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char *default_time_zone, + cudf::size_type default_time_zone_char_len, bool allow_time_zone, + bool allow_special_expressions, bool ansi_mode) { + auto error_us = cudf::timestamp_us{cudf::duration_us{0}}; if (timestamp_str.empty()) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } const char *curr_ptr = timestamp_str.data(); @@ -206,7 +117,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, } if (curr_ptr == end_ptr) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } const char *const bytes = curr_ptr; @@ -241,7 +152,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, } else if (i < 2) { if (b == '-') { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return 
thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; @@ -250,43 +161,43 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, } else if (0 == i && ':' == b && !year_sign.has_value()) { just_time = true; if (!is_valid_digits(3, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[3] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i = 4; } else { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } } else if (2 == i) { if (' ' == b || 'T' == b) { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } } else if (3 == i || 4 == i) { if (':' == b) { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } } else if (5 == i || 6 == i) { if ('.' 
== b && 5 == i) { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; @@ -294,7 +205,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, i += 1; } else { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; @@ -309,14 +220,14 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, } else { if (i < segments_len && (':' == b || ' ' == b)) { if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } } } else { @@ -334,7 +245,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, } if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_pair(error_compoments, error_time_zone); + return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; @@ -345,13 +256,13 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, cudf::string_view timze_zone; if (tz.has_value()) { - timze_zone = format_zone_id(tz.value()); + timze_zone = tz.value(); } else { - timze_zone = default_time_zone; + timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len); } segments[0] *= year_sign.value_or(1); - // above is translated from Spark. + // above is ported from Spark. 
// set components auto components = timestamp_components{segments[0], @@ -364,90 +275,74 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, if (just_time) { components.year = components.month = components.day = -1; } - return thrust::make_pair(components, timze_zone); + return to_utc_timestamp(components, timze_zone); } struct parse_timestamp_string_fn { cudf::column_device_view const d_strings; - cudf::string_view default_time_zone; + const char *default_time_zone; + cudf::size_type default_time_zone_char_len; + bool allow_time_zone; + bool allow_special_expressions; + bool ansi_mode; - __device__ cudf::timestamp_us operator()(const cudf::size_type &idx) const { + __device__ thrust::tuple operator()(const cudf::size_type &idx) const { auto const d_str = d_strings.element(idx); - auto components_tz = parse_string_to_timestamp_components_tz(d_str, default_time_zone); - return create_timestamp_from_components_and_zone(components_tz.first, components_tz.second); + return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len, + allow_time_zone, allow_special_expressions, ansi_mode); } }; /** * - * Trims and parses timestamp string column to a timestamp column and a time zone - * column + * Trims and parses timestamp string column to a timestamp column and a is valid column * */ -std::unique_ptr parse_string_to_timestamp_and_time_zone( - cudf::strings_column_view const &input, cudf::string_view default_time_zone, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +std::pair, std::unique_ptr> +string_to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, bool allow_time_zone, + bool allow_special_expressions, bool ansi_mode, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { auto d_strings = cudf::column_device_view::create(input.parent(), stream); auto output_timestamp = cudf::make_timestamp_column( 
cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); + auto output_bool = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::BOOL8}, input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); - thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - output_timestamp->mutable_view().begin(), - parse_timestamp_string_fn{*d_strings, default_time_zone}); - - return output_timestamp; + thrust::transform( + rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(output_timestamp->mutable_view().begin(), + output_bool->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, default_time_zone.data(), + static_cast(default_time_zone.size()), + allow_time_zone, allow_special_expressions, ansi_mode}); + + return std::make_pair(std::move(output_timestamp), std::move(output_bool)); } } // namespace namespace spark_rapids_jni { -/** - * - * Trims and parses timestamp string column to a timestamp components column and a time zone - * column, then create timestamp column - * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ - * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 - * - * @param input input string column view. - * @param default_time_zone if input string does not contain a time zone, use this time zone. - * @returns timestamp components column and time zone string. - * be empty. 
- */ -std::unique_ptr parse_string_to_timestamp(cudf::strings_column_view const &input, - cudf::string_view default_time_zone) { +std::pair, std::unique_ptr> +parse_string_to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, bool allow_time_zone, + bool allow_special_expressions, bool ansi_mode) { auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}; if (input.size() == 0) { - return cudf::make_empty_column(timestamp_type.id()); + return std::make_pair(cudf::make_empty_column(timestamp_type.id()), + cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8})); } auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); - return parse_string_to_timestamp_and_time_zone(input, default_time_zone, stream, mr); -} - -/** - * - * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone` - */ -std::unique_ptr -string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, - bool allow_time_zone) { - // TODO - throw std::runtime_error("Not implemented!!!"); -} - -/** - * - * Refer to `SparkDateTimeUtils.stringToTimestamp` - */ -std::unique_ptr string_to_timestamp(cudf::strings_column_view const &input, - cudf::string_view time_zone) { - // TODO - throw std::runtime_error("Not implemented!!!"); + return string_to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, + ansi_mode, stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 509fcf8008..139e69086b 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -19,49 +19,54 @@ namespace spark_rapids_jni { /** - * represents local date time in a time zone. 
- */ -struct timestamp_components { - int32_t year; // max 6 digits - int8_t month; - int8_t day; - int8_t hour; - int8_t minute; - int8_t second; - int32_t microseconds; -}; - -thrust::pair -parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str, - cudf::string_view default_time_zone); - -/** * - * Trims and parses timestamp string column to a timestamp components column and a time zone - * column, then create timestamp column + * Trims and parses a timestamp string column with time zone suffix to a timestamp column. + * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00 + * * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 * - * @param input input string column view. - * @param default_time_zone if input string does not contain a time zone, use this time zone. - * @returns timestamp components column and time zone string. - * be empty. - */ -std::unique_ptr parse_string_to_timestamp(cudf::strings_column_view const &input, - cudf::string_view default_time_zone); - -/** + * Formats are: * - * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone` - */ -std::unique_ptr -string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, bool allow_time_zone); - -/** + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * Spark supports the following zone id forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, + * and a suffix in the formats: + * - +|-h[h] + * 
- +|-hh[:]mm + * - +|-hh:mm:ss + * - +|-hhmmss + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Unlike Spark, Spark-Rapids only supports the following time zones: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` * - * Refer to `SparkDateTimeUtils.stringToTimestamp` + * + * @param input input string column view. + * @param default_time_zone if input string does not contain a time zone, use this time zone. + * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: + * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. + * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings. + * @param ansi_mode is ansi mode + * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise. */ -std::unique_ptr string_to_timestamp(cudf::strings_column_view const &input, - cudf::string_view time_zone); - +std::pair, std::unique_ptr> +parse_string_to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode); } // namespace spark_rapids_jni From df60772b70befbbef9a95ec2088532a662fdd996 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 12 Dec 2023 17:05:15 +0800 Subject: [PATCH 03/35] Update --- src/main/cpp/src/datetime_parser.cu | 368 ++++++++++++++++++------- src/main/cpp/src/datetime_parser.hpp | 23 +- src/main/cpp/tests/CMakeLists.txt | 3 + src/main/cpp/tests/datetime_parser.cpp | 55 ++++ 4 files changed, 351 insertions(+), 98 deletions(-) create mode 100644 src/main/cpp/tests/datetime_parser.cpp diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 14ed0a9c9f..cd5b53e32f 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -18,27 +18,20 @@ #include 
#include -#include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include + +#include +#include #include #include #include #include -#include #include #include #include -#include #include #include #include @@ -46,12 +39,14 @@ #include "datetime_parser.hpp" -namespace { +namespace +{ /** * represents local date time in a time zone. */ -struct timestamp_components { +struct timestamp_components +{ int32_t year; // max 6 digits int8_t month; int8_t day; @@ -65,42 +60,73 @@ struct timestamp_components { * convert a local time in a time zone to UTC timestamp */ __device__ __host__ thrust::tuple -to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) { +to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) +{ // TODO replace the temp implementation long v = 365L * 86400L * 1000000L * components.year + 30L * 86400L * 1000000L * components.month + - 86400L * 1000000L * components.day + 3600L * 1000000L * components.hour + - 60L * 1000000L * components.minute + 1000000L * components.second + - components.microseconds; + 86400L * 1000000L * components.day + 3600L * 1000000L * components.hour + + 60L * 1000000L * components.minute + 1000000L * components.second + + components.microseconds; return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true); } -__device__ __host__ inline bool is_whitespace(const char chr) { - switch (chr) { - case ' ': - case '\r': - case '\t': - case '\n': return true; - default: return false; +__device__ __host__ inline bool is_whitespace(const char chr) +{ + switch (chr) + { + case ' ': + case '\r': + case '\t': + case '\n': + return true; + default: + return false; + } +} + +// compare 2 strings are equal ignore case, the expect string should be lower-case +__device__ __host__ inline bool equals(const char *actual_begin, const char *actual_end, + const char *expect_begin, const char *expect_end) +{ + if (actual_end - 
actual_begin != expect_end - expect_begin) + { + return false; + } + + while (actual_begin < actual_end) + { + // the diff between upper case and lower case for a same char is 32 + if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) + { + return false; + } + actual_begin++; + expect_begin++; } + return true; } -__device__ __host__ bool is_valid_digits(int segment, int digits) { +__device__ __host__ bool is_valid_digits(int segment, int digits) +{ // A Long is able to represent a timestamp within [+-]200 thousand years const int constexpr maxDigitsYear = 6; // For the nanosecond part, more than 6 digits is allowed, but will be truncated. return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID - (segment == 7 && digits <= 2) || - (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); + // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); } __device__ __host__ thrust::tuple parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char *default_time_zone, - cudf::size_type default_time_zone_char_len, bool allow_time_zone, - bool allow_special_expressions, bool ansi_mode) { + cudf::size_type default_time_zone_char_len, bool allow_time_zone, + bool allow_special_expressions) +{ + auto error_us = cudf::timestamp_us{cudf::duration_us{0}}; - if (timestamp_str.empty()) { + if (timestamp_str.empty()) + { return thrust::make_tuple(error_us, false); } @@ -108,15 +134,58 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); // trim left - while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { + while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) + { ++curr_ptr; } // 
trim right - while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) { + while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) + { --end_ptr; } - if (curr_ptr == end_ptr) { + // special strings: epoch, now, today, yesterday, tomorrow + // TODO + if (allow_special_expressions) + { + char const *begin_epoch = "epoch"; + char const *end_epoch = begin_epoch + 5; + + char const *begin_now = "now"; + char const *end_now = begin_now + 3; + + char const *begin_today = "today"; + char const *end_today = begin_today + 5; + + char const *begin_yesterday = "yesterday"; + char const *end_yesterday = begin_yesterday + 9; + + char const *begin_tomorrow = "tomorrow"; + char const *end_tomorrow = begin_tomorrow + 8; + if (equals(curr_ptr, end_ptr, begin_epoch, end_epoch)) + { // epoch + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, true); + } + else if (equals(curr_ptr, end_ptr, begin_now, end_now)) + { + // now + } + else if (equals(curr_ptr, end_ptr, begin_today, end_today)) + { + // today + } + else if (equals(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) + { + // yesterday + } + else if (equals(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) + { + // tomorrow + } + } + + if (curr_ptr == end_ptr) + { return thrust::make_tuple(error_us, false); } @@ -133,78 +202,113 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char int digits_milli = 0; bool just_time = false; thrust::optional year_sign; - if ('-' == bytes[j] || '+' == bytes[j]) { - if ('-' == bytes[j]) { + if ('-' == bytes[j] || '+' == bytes[j]) + { + if ('-' == bytes[j]) + { year_sign = -1; - } else { + } + else + { year_sign = 1; } j += 1; } - while (j < bytes_length) { + while (j < bytes_length) + { char b = bytes[j]; int parsed_value = static_cast(b - '0'); - if (parsed_value < 0 || parsed_value > 9) { - if (0 == j && 'T' == b) { + if (parsed_value < 0 || parsed_value > 9) + { + if (0 == j && 'T' == b) + { just_time = true; i += 3; - } else if (i < 
2) { - if (b == '-') { - if (!is_valid_digits(i, current_segment_digits)) { + } + else if (i < 2) + { + if (b == '-') + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - } else if (0 == i && ':' == b && !year_sign.has_value()) { + } + else if (0 == i && ':' == b && !year_sign.has_value()) + { just_time = true; - if (!is_valid_digits(3, current_segment_digits)) { + if (!is_valid_digits(3, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[3] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i = 4; - } else { + } + else + { return thrust::make_tuple(error_us, false); } - } else if (2 == i) { - if (' ' == b || 'T' == b) { - if (!is_valid_digits(i, current_segment_digits)) { + } + else if (2 == i) + { + if (' ' == b || 'T' == b) + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - } else { + } + else + { return thrust::make_tuple(error_us, false); } - } else if (3 == i || 4 == i) { - if (':' == b) { - if (!is_valid_digits(i, current_segment_digits)) { + } + else if (3 == i || 4 == i) + { + if (':' == b) + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - } else { + } + else + { return thrust::make_tuple(error_us, false); } - } else if (5 == i || 6 == i) { - if ('.' == b && 5 == i) { - if (!is_valid_digits(i, current_segment_digits)) { + } + else if (5 == i || 6 == i) + { + if ('.' 
== b && 5 == i) + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - } else { - if (!is_valid_digits(i, current_segment_digits)) { + } + else + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; @@ -214,29 +318,40 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char tz = cudf::string_view(bytes + j, (bytes_length - j)); j = bytes_length - 1; } - if (i == 6 && '.' != b) { + if (i == 6 && '.' != b) + { i += 1; } - } else { - if (i < segments_len && (':' == b || ' ' == b)) { - if (!is_valid_digits(i, current_segment_digits)) { + } + else + { + if (i < segments_len && (':' == b || ' ' == b)) + { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - } else { + } + else + { return thrust::make_tuple(error_us, false); } } - } else { - if (6 == i) { + } + else + { + if (6 == i) + { digits_milli += 1; } // We will truncate the nanosecond part if there are more than 6 digits, which results // in loss of precision - if (6 != i || current_segment_digits < 6) { + if (6 != i || current_segment_digits < 6) + { current_segment_value = current_segment_value * 10 + parsed_value; } current_segment_digits += 1; @@ -244,20 +359,43 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char j += 1; } - if (!is_valid_digits(i, current_segment_digits)) { + if (!is_valid_digits(i, current_segment_digits)) + { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; - while (digits_milli < 6) { + while (digits_milli < 6) + { segments[6] *= 10; digits_milli += 1; } + if (default_time_zone_char_len == 0) + { + // invoke from 
`string_to_timestamp_without_time_zone` + if (just_time || !allow_time_zone && tz.has_value()) + { + return thrust::make_tuple(error_us, false); + } + } + else + { + // invoke from `string_to_timestamp` + if (just_time) + { + // TODO + // set today: year-month-day + } + } + cudf::string_view timze_zone; - if (tz.has_value()) { + if (tz.has_value()) + { timze_zone = tz.value(); - } else { + } + else + { timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len); } @@ -266,30 +404,29 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char // set components auto components = timestamp_components{segments[0], - static_cast(segments[1]), - static_cast(segments[2]), - static_cast(segments[3]), - static_cast(segments[4]), - static_cast(segments[5]), - segments[6]}; - if (just_time) { - components.year = components.month = components.day = -1; - } + static_cast(segments[1]), + static_cast(segments[2]), + static_cast(segments[3]), + static_cast(segments[4]), + static_cast(segments[5]), + segments[6]}; + return to_utc_timestamp(components, timze_zone); } -struct parse_timestamp_string_fn { +struct parse_timestamp_string_fn +{ cudf::column_device_view const d_strings; const char *default_time_zone; cudf::size_type default_time_zone_char_len; bool allow_time_zone; bool allow_special_expressions; - bool ansi_mode; - __device__ thrust::tuple operator()(const cudf::size_type &idx) const { + __device__ thrust::tuple operator()(const cudf::size_type &idx) const + { auto const d_str = d_strings.element(idx); return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len, - allow_time_zone, allow_special_expressions, ansi_mode); + allow_time_zone, allow_special_expressions); } }; @@ -299,10 +436,11 @@ struct parse_timestamp_string_fn { * */ std::pair, std::unique_ptr> -string_to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, bool allow_time_zone, - bool 
allow_special_expressions, bool ansi_mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, bool allow_time_zone, + bool allow_special_expressions, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ auto d_strings = cudf::column_device_view::create(input.parent(), stream); auto output_timestamp = cudf::make_timestamp_column( @@ -317,32 +455,68 @@ string_to_timestamp(cudf::strings_column_view const &input, thrust::make_counting_iterator(input.size()), thrust::make_zip_iterator( thrust::make_tuple(output_timestamp->mutable_view().begin(), - output_bool->mutable_view().begin())), + output_bool->mutable_view().begin())), parse_timestamp_string_fn{*d_strings, default_time_zone.data(), static_cast(default_time_zone.size()), - allow_time_zone, allow_special_expressions, ansi_mode}); + allow_time_zone, allow_special_expressions}); return std::make_pair(std::move(output_timestamp), std::move(output_bool)); } } // namespace -namespace spark_rapids_jni { +namespace spark_rapids_jni +{ -std::pair, std::unique_ptr> +std::pair, bool> parse_string_to_timestamp(cudf::strings_column_view const &input, std::string_view const &default_time_zone, bool allow_time_zone, - bool allow_special_expressions, bool ansi_mode) { + bool allow_special_expressions, bool ansi_mode) +{ auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}; - if (input.size() == 0) { - return std::make_pair(cudf::make_empty_column(timestamp_type.id()), - cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8})); + if (input.size() == 0) + { + return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true); } auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); - return string_to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, - ansi_mode, stream, mr); + auto 
[timestamp_column, valid_column] = to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, + stream, mr); + if (ansi_mode) + { + cudf::numeric_scalar false_scalar{false, true, stream}; + if (cudf::contains(*valid_column, false_scalar, stream)) + { + return std::make_pair(nullptr, false); + } + else + { + return std::make_pair(std::move(timestamp_column), true); + } + } + else + { + return std::make_pair(std::move(timestamp_column), true); + } +} + +std::pair, bool> +string_to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, + bool allow_special_expressions, + bool ansi_mode) +{ + return parse_string_to_timestamp(input, default_time_zone, true, allow_special_expressions, ansi_mode); +} + +std::pair, bool> +string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode) +{ + return parse_string_to_timestamp(input, std::string_view(""), allow_time_zone, allow_special_expressions, ansi_mode); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 139e69086b..26ecf421a3 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -63,10 +63,31 @@ namespace spark_rapids_jni { * @param ansi_mode is ansi mode * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise. 
*/ -std::pair, std::unique_ptr> +std::pair, bool> parse_string_to_timestamp(cudf::strings_column_view const &input, std::string_view const &default_time_zone, bool allow_time_zone, bool allow_special_expressions, bool ansi_mode); +/** + * Refer to `parse_string_to_timestamp` + * If timestamp string does not contain date info(yyyy mm dd), use current date +*/ +std::pair, bool> +string_to_timestamp(cudf::strings_column_view const &input, + std::string_view const &default_time_zone, + bool allow_special_expressions, + bool ansi_mode); + +/** + * Refer to `parse_string_to_timestamp` + * + * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: + * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. +*/ +std::pair, bool> +string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode); } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 617df6dfde..1f58176327 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -57,6 +57,9 @@ ConfigureTest(FORMAT_FLOAT ConfigureTest(CAST_FLOAT_TO_STRING cast_float_to_string.cpp) +ConfigureTest(DATETIME_PARSER + datetime_parser.cpp) + ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp new file mode 100644 index 0000000000..ff6c7b79db --- /dev/null +++ b/src/main/cpp/tests/datetime_parser.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using timestamp_col = cudf::test::fixed_width_column_wrapper; + +struct DateTimeParserTest : public cudf::test::BaseFixture +{ +}; + +TEST_F(DateTimeParserTest, ParseTimestamp) +{ + auto const ts_col = timestamp_col{ + -719162L, -354285L, -141714, -141438, -141437, -141432, -141427, -31463, -31453, -1, 0, 18335}; + + auto const ts_strings = + cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z", + "2023-11-05T03:04:55 ", + "2023-11-05T03:04:55.123456 "}; + auto const parsed_ts = + cudf::strings::string_to_timestamp(cudf::strings_column_view(ts_strings), + "Z", + true, + true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *parsed_ts); +} From a4a83c02ebd929fc2292ea7e84043f0e61526e27 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 12 Dec 2023 18:08:24 +0800 Subject: [PATCH 04/35] Update --- src/main/cpp/src/datetime_parser.cu | 17 +++++++++++------ src/main/cpp/tests/datetime_parser.cpp | 18 +++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index cd5b53e32f..4b607f4876 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -62,12 +62,15 @@ struct timestamp_components __device__ __host__ thrust::tuple to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) { - // TODO replace the temp implementation - long v = 365L * 86400L * 1000000L * 
components.year + 30L * 86400L * 1000000L * components.month + - 86400L * 1000000L * components.day + 3600L * 1000000L * components.hour + - 60L * 1000000L * components.minute + 1000000L * components.second + - components.microseconds; - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true); + // TODO replace the fake implementation + long seconds = components.year * 365L * 86400L + + components.month * 30L * 86400L + + components.day * 86400L + + components.hour * 3600L + + components.minute * 60L + + components.second; + long us = seconds * 1000000L + components.microseconds; + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true); } __device__ __host__ inline bool is_whitespace(const char chr) @@ -492,11 +495,13 @@ parse_string_to_timestamp(cudf::strings_column_view const &input, } else { + // TODO update bitmask return std::make_pair(std::move(timestamp_column), true); } } else { + // TODO update bitmask return std::make_pair(std::move(timestamp_column), true); } } diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp index ff6c7b79db..47a733a41e 100644 --- a/src/main/cpp/tests/datetime_parser.cpp +++ b/src/main/cpp/tests/datetime_parser.cpp @@ -39,17 +39,17 @@ struct DateTimeParserTest : public cudf::test::BaseFixture TEST_F(DateTimeParserTest, ParseTimestamp) { - auto const ts_col = timestamp_col{ - -719162L, -354285L, -141714, -141438, -141437, -141432, -141427, -31463, -31453, -1, 0, 18335}; + auto v = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L + 3L * 3600L + 4L * 60L + 55L) * 1000000L; + auto const ts_col = timestamp_col{v, v, v + 123456}; auto const ts_strings = cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z", - "2023-11-05T03:04:55 ", - "2023-11-05T03:04:55.123456 "}; + "2023-11-05T03:04:55 ", + "2023-11-05T03:04:55.123456 "}; auto const parsed_ts = - cudf::strings::string_to_timestamp(cudf::strings_column_view(ts_strings), - "Z", - true, - true); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *parsed_ts); + spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), + "Z", + true, + true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(parsed_ts.first)); } From 89eef6b34e98c33da84ad1373c1eaf83ee34393d Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 14 Dec 2023 09:48:49 +0800 Subject: [PATCH 05/35] Fix bitmask; Parse special timestamp strings: now, today ...; Add Ansi mode check --- src/main/cpp/src/datetime_parser.cu | 621 +++++++++++++------------ src/main/cpp/src/datetime_parser.hpp | 104 +++-- src/main/cpp/tests/datetime_parser.cpp | 169 ++++++- 3 files changed, 546 insertions(+), 348 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 4b607f4876..6411c00017 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -18,11 +18,14 @@ #include #include +#include + #include #include #include +#include #include - +#include #include #include #include @@ -39,15 +42,13 @@ #include "datetime_parser.hpp" -namespace -{ +namespace { /** - * represents local date time in a time zone. + * Represents local date time in a time zone. 
*/ -struct timestamp_components -{ - int32_t year; // max 6 digits +struct timestamp_components { + int32_t year; // max 6 digits int8_t month; int8_t day; int8_t hour; @@ -57,304 +58,255 @@ struct timestamp_components }; /** - * convert a local time in a time zone to UTC timestamp + * Convert a local time in a time zone to a UTC timestamp */ -__device__ __host__ thrust::tuple -to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) +__device__ __host__ thrust::tuple to_utc_timestamp( + timestamp_components const& components, cudf::string_view const& time_zone) { - // TODO replace the fake implementation - long seconds = components.year * 365L * 86400L + - components.month * 30L * 86400L + - components.day * 86400L + - components.hour * 3600L + - components.minute * 60L + + // TODO replace the following fake implementation + long seconds = components.year * 365L * 86400L + components.month * 30L * 86400L + + components.day * 86400L + components.hour * 3600L + components.minute * 60L + components.second; long us = seconds * 1000000L + components.microseconds; return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true); } +/** + * Is white space + */ __device__ __host__ inline bool is_whitespace(const char chr) { - switch (chr) - { - case ' ': - case '\r': - case '\t': - case '\n': - return true; - default: - return false; + switch (chr) { + case ' ': + case '\r': + case '\t': + case '\n': return true; + default: return false; } } -// compare 2 strings are equal ignore case, the expect string should be lower-case -__device__ __host__ inline bool equals(const char *actual_begin, const char *actual_end, - const char *expect_begin, const char *expect_end) +/** + * Whether the given two strings are equal, + * used to compare special timestamp strings ignoring case: + * "epoch", "now", "today", "yesterday", "tomorrow" + * the expect string should be lower-case a-z chars + */ +__device__ __host__ inline bool 
equals_ascii_ignore_case(char const* actual_begin, + char const* actual_end, + char const* expect_begin, + char const* expect_end) { - if (actual_end - actual_begin != expect_end - expect_begin) - { - return false; - } + if (actual_end - actual_begin != expect_end - expect_begin) { return false; } - while (actual_begin < actual_end) - { + while (expect_begin < expect_end) { // the diff between upper case and lower case for a same char is 32 - if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) - { - return false; - } + if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; } actual_begin++; expect_begin++; } return true; } +/** + * Ported from Spark + */ __device__ __host__ bool is_valid_digits(int segment, int digits) { // A Long is able to represent a timestamp within [+-]200 thousand years const int constexpr maxDigitsYear = 6; // For the nanosecond part, more than 6 digits is allowed, but will be truncated. return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID - (segment == 7 && digits <= 2) || - (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); + // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); } -__device__ __host__ thrust::tuple -parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char *default_time_zone, - cudf::size_type default_time_zone_char_len, bool allow_time_zone, - bool allow_special_expressions) +/** + * Ported from Spark: + * https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + * + * Parse a string with time zone to a timestamp. 
+ * The bool in the returned tuple is false if the parse failed. + */ +__device__ __host__ thrust::tuple parse_string_to_timestamp_us( + cudf::string_view const& timestamp_str, + const char* default_time_zone, + cudf::size_type default_time_zone_char_len, + bool allow_time_zone, + bool allow_special_expressions, + cudf::timestamp_us epoch, + cudf::timestamp_us now, + cudf::timestamp_us today, + cudf::timestamp_us tomorrow, + cudf::timestamp_us yesterday) { - auto error_us = cudf::timestamp_us{cudf::duration_us{0}}; - if (timestamp_str.empty()) - { - return thrust::make_tuple(error_us, false); - } + if (timestamp_str.empty()) { return thrust::make_tuple(error_us, false); } - const char *curr_ptr = timestamp_str.data(); - const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); + const char* curr_ptr = timestamp_str.data(); + const char* end_ptr = curr_ptr + timestamp_str.size_bytes(); // trim left - while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) - { + while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { ++curr_ptr; } // trim right - while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) - { + while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) { --end_ptr; } // special strings: epoch, now, today, yesterday, tomorrow - // TODO - if (allow_special_expressions) - { - char const *begin_epoch = "epoch"; - char const *end_epoch = begin_epoch + 5; + if (allow_special_expressions) { + char const* begin_epoch = "epoch"; + char const* end_epoch = begin_epoch + 5; - char const *begin_now = "now"; - char const *end_now = begin_now + 3; + char const* begin_now = "now"; + char const* end_now = begin_now + 3; - char const *begin_today = "today"; - char const *end_today = begin_today + 5; + char const* begin_today = "today"; + char const* end_today = begin_today + 5; - char const *begin_yesterday = "yesterday"; - char const *end_yesterday = begin_yesterday + 9; + char const* begin_tomorrow = "tomorrow"; + char const* end_tomorrow = 
begin_tomorrow + 8; - char const *begin_tomorrow = "tomorrow"; - char const *end_tomorrow = begin_tomorrow + 8; - if (equals(curr_ptr, end_ptr, begin_epoch, end_epoch)) - { // epoch - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, true); - } - else if (equals(curr_ptr, end_ptr, begin_now, end_now)) - { + char const* begin_yesterday = "yesterday"; + char const* end_yesterday = begin_yesterday + 9; + + if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_epoch, end_epoch)) { + // epoch + return thrust::make_tuple(epoch, true); + } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_now, end_now)) { // now - } - else if (equals(curr_ptr, end_ptr, begin_today, end_today)) - { + return thrust::make_tuple(now, true); + } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_today, end_today)) { // today - } - else if (equals(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) - { - // yesterday - } - else if (equals(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) - { + return thrust::make_tuple(today, true); + } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) { // tomorrow + return thrust::make_tuple(tomorrow, true); + } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) { + // yesterday + return thrust::make_tuple(yesterday, true); } } - if (curr_ptr == end_ptr) - { - return thrust::make_tuple(error_us, false); - } + if (curr_ptr == end_ptr) { return thrust::make_tuple(error_us, false); } - const char *const bytes = curr_ptr; + const char* const bytes = curr_ptr; const size_t bytes_length = end_ptr - curr_ptr; thrust::optional tz; - int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; - int segments_len = 9; - int i = 0; - int current_segment_value = 0; + int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; + int segments_len = 9; + int i = 0; + int current_segment_value = 0; int current_segment_digits = 0; - size_t j = 0; - int digits_milli = 0; - bool just_time = false; + 
size_t j = 0; + int digits_milli = 0; + bool just_time = false; thrust::optional year_sign; - if ('-' == bytes[j] || '+' == bytes[j]) - { - if ('-' == bytes[j]) - { + if ('-' == bytes[j] || '+' == bytes[j]) { + if ('-' == bytes[j]) { year_sign = -1; - } - else - { + } else { year_sign = 1; } j += 1; } - while (j < bytes_length) - { - char b = bytes[j]; + while (j < bytes_length) { + char b = bytes[j]; int parsed_value = static_cast(b - '0'); - if (parsed_value < 0 || parsed_value > 9) - { - if (0 == j && 'T' == b) - { + if (parsed_value < 0 || parsed_value > 9) { + if (0 == j && 'T' == b) { just_time = true; i += 3; - } - else if (i < 2) - { - if (b == '-') - { - if (!is_valid_digits(i, current_segment_digits)) - { + } else if (i < 2) { + if (b == '-') { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; - } - else if (0 == i && ':' == b && !year_sign.has_value()) - { + } else if (0 == i && ':' == b && !year_sign.has_value()) { just_time = true; - if (!is_valid_digits(3, current_segment_digits)) - { + if (!is_valid_digits(3, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[3] = current_segment_value; - current_segment_value = 0; + segments[3] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; - i = 4; - } - else - { + i = 4; + } else { return thrust::make_tuple(error_us, false); } - } - else if (2 == i) - { - if (' ' == b || 'T' == b) - { - if (!is_valid_digits(i, current_segment_digits)) - { + } else if (2 == i) { + if (' ' == b || 'T' == b) { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; 
current_segment_digits = 0; i += 1; - } - else - { + } else { return thrust::make_tuple(error_us, false); } - } - else if (3 == i || 4 == i) - { - if (':' == b) - { - if (!is_valid_digits(i, current_segment_digits)) - { + } else if (3 == i || 4 == i) { + if (':' == b) { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; - } - else - { + } else { return thrust::make_tuple(error_us, false); } - } - else if (5 == i || 6 == i) - { - if ('.' == b && 5 == i) - { - if (!is_valid_digits(i, current_segment_digits)) - { + } else if (5 == i || 6 == i) { + if ('.' == b && 5 == i) { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; - } - else - { - if (!is_valid_digits(i, current_segment_digits)) - { + } else { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; tz = cudf::string_view(bytes + j, (bytes_length - j)); - j = bytes_length - 1; + j = bytes_length - 1; } - if (i == 6 && '.' != b) - { - i += 1; - } - } - else - { - if (i < segments_len && (':' == b || ' ' == b)) - { - if (!is_valid_digits(i, current_segment_digits)) - { + if (i == 6 && '.' 
!= b) { i += 1; } + } else { + if (i < segments_len && (':' == b || ' ' == b)) { + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; - } - else - { + } else { return thrust::make_tuple(error_us, false); } } - } - else - { - if (6 == i) - { - digits_milli += 1; - } + } else { + if (6 == i) { digits_milli += 1; } // We will truncate the nanosecond part if there are more than 6 digits, which results // in loss of precision - if (6 != i || current_segment_digits < 6) - { + if (6 != i || current_segment_digits < 6) { current_segment_value = current_segment_value * 10 + parsed_value; } current_segment_digits += 1; @@ -362,74 +314,82 @@ parse_string_to_timestamp_us(cudf::string_view const ×tamp_str, const char j += 1; } - if (!is_valid_digits(i, current_segment_digits)) - { - return thrust::make_tuple(error_us, false); - } + if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } segments[i] = current_segment_value; - while (digits_milli < 6) - { + while (digits_milli < 6) { segments[6] *= 10; digits_milli += 1; } - if (default_time_zone_char_len == 0) - { + cudf::string_view time_zone; + if (tz.has_value()) { + time_zone = tz.value(); + } else { + time_zone = cudf::string_view(default_time_zone, default_time_zone_char_len); + } + + segments[0] *= year_sign.value_or(1); + // above is ported from Spark. 
+ + if (default_time_zone_char_len == 0) { // invoke from `string_to_timestamp_without_time_zone` - if (just_time || !allow_time_zone && tz.has_value()) - { + if (just_time || !allow_time_zone && tz.has_value()) { return thrust::make_tuple(error_us, false); } - } - else - { + } else { // invoke from `string_to_timestamp` - if (just_time) - { - // TODO - // set today: year-month-day + if (just_time) { + // Update here to support the following format: + // `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + // `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + // by set local date in a time zone: year-month-day. + // Above 2 formats are time zone related, Spark uses LocalDate.now(zoneId) + + // do not support currently + return thrust::make_tuple(error_us, false); } } - cudf::string_view timze_zone; - if (tz.has_value()) - { - timze_zone = tz.value(); - } - else - { - timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len); - } - - segments[0] *= year_sign.value_or(1); - // above is ported from Spark. - // set components auto components = timestamp_components{segments[0], - static_cast(segments[1]), - static_cast(segments[2]), - static_cast(segments[3]), - static_cast(segments[4]), - static_cast(segments[5]), - segments[6]}; - - return to_utc_timestamp(components, timze_zone); + static_cast(segments[1]), + static_cast(segments[2]), + static_cast(segments[3]), + static_cast(segments[4]), + static_cast(segments[5]), + segments[6]}; + + return to_utc_timestamp(components, time_zone); } -struct parse_timestamp_string_fn -{ +struct parse_timestamp_string_fn { cudf::column_device_view const d_strings; - const char *default_time_zone; + const char* default_time_zone; cudf::size_type default_time_zone_char_len; bool allow_time_zone; bool allow_special_expressions; - - __device__ thrust::tuple operator()(const cudf::size_type &idx) const + // TODO the following should be passed in. 
+ // Note: today, tomorrow, yesterday are time zone related, should use time zone to generate. + cudf::timestamp_us epoch = cudf::timestamp_us{cudf::duration_us{111L}}; + cudf::timestamp_us now = cudf::timestamp_us{cudf::duration_us{222L}}; + cudf::timestamp_us today = cudf::timestamp_us{cudf::duration_us{333L}}; + cudf::timestamp_us tomorrow = cudf::timestamp_us{cudf::duration_us{444L}}; + cudf::timestamp_us yesterday = cudf::timestamp_us{cudf::duration_us{555L}}; + + __device__ thrust::tuple operator()(const cudf::size_type& idx) const { auto const d_str = d_strings.element(idx); - return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len, - allow_time_zone, allow_special_expressions); + return parse_string_to_timestamp_us(d_str, + default_time_zone, + default_time_zone_char_len, + allow_time_zone, + allow_special_expressions, + epoch, + now, + today, + tomorrow, + yesterday); } }; @@ -438,90 +398,153 @@ struct parse_timestamp_string_fn * Trims and parses timestamp string column to a timestamp column and a is valid column * */ -std::pair, std::unique_ptr> -to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, bool allow_time_zone, - bool allow_special_expressions, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::pair, std::unique_ptr> to_timestamp( + cudf::strings_column_view const& input, + std::string_view const& default_time_zone, + bool allow_time_zone, + bool allow_special_expressions, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto d_strings = cudf::column_device_view::create(input.parent(), stream); - auto output_timestamp = cudf::make_timestamp_column( - cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); - auto output_bool = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::BOOL8}, input.size(), - 
cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr); + auto output_timestamp = + cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + // record which string is failed to parse. + auto output_bool = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); thrust::transform( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator( - thrust::make_tuple(output_timestamp->mutable_view().begin(), - output_bool->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, default_time_zone.data(), - static_cast(default_time_zone.size()), - allow_time_zone, allow_special_expressions}); + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(output_timestamp->mutable_view().begin(), + output_bool->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, + default_time_zone.data(), + static_cast(default_time_zone.size()), + allow_time_zone, + allow_special_expressions}); return std::make_pair(std::move(output_timestamp), std::move(output_bool)); } -} // namespace - -namespace spark_rapids_jni +/** + * Set the null mask of timestamp column according to the valid column. 
+ */ +void update_bitmask(cudf::column& timestamp_column, + cudf::column const& validity_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto const& ts_view = timestamp_column.view(); + auto const& valid_view = validity_column.view(); + std::vector masks; + std::vector offsets; + if (timestamp_column.nullable()) { + masks.push_back(ts_view.null_mask()); + offsets.push_back(ts_view.offset()); + } + + // generate bitmask from `validity_column` + auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if( + valid_view.begin(), valid_view.end(), thrust::identity{}, stream, mr); -std::pair, bool> -parse_string_to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, bool allow_time_zone, - bool allow_special_expressions, bool ansi_mode) + masks.push_back(static_cast(valid_bitmask.data())); + offsets.push_back(0); + + // merge 2 bitmasks + auto [null_mask, null_count] = + cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr); + + timestamp_column.set_null_mask(null_mask, null_count); +} + +/** + * Parse string column with time zone to timestamp column, + * Returns a pair of timestamp column and a bool indicates whether successed. 
+ */ +std::pair, bool> parse_string_to_timestamp( + cudf::strings_column_view const& input, + std::string_view const& default_time_zone, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode) { auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}; - if (input.size() == 0) - { + if (input.size() == 0) { return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true); } auto const stream = cudf::get_default_stream(); - auto const mr = rmm::mr::get_current_device_resource(); - auto [timestamp_column, valid_column] = to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, - stream, mr); - if (ansi_mode) - { - cudf::numeric_scalar false_scalar{false, true, stream}; - if (cudf::contains(*valid_column, false_scalar, stream)) - { + auto const mr = rmm::mr::get_current_device_resource(); + auto [timestamp_column, validity_column] = + to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, stream, mr); + + if (ansi_mode) { + // create scalar, value is false, is_valid is true + cudf::numeric_scalar false_scalar{false, true, stream, mr}; + if (cudf::contains(*validity_column, false_scalar, stream)) { + // has invalid value in validity column under ansi mode return std::make_pair(nullptr, false); - } - else - { - // TODO update bitmask + } else { + update_bitmask(*timestamp_column, *validity_column, stream, mr); return std::make_pair(std::move(timestamp_column), true); } - } - else - { - // TODO update bitmask + } else { + update_bitmask(*timestamp_column, *validity_column, stream, mr); return std::make_pair(std::move(timestamp_column), true); } } -std::pair, bool> -string_to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, - bool allow_special_expressions, - bool ansi_mode) +} // namespace + +namespace spark_rapids_jni { + +/** + * Parse string column with time zone to timestamp column, + * Returns a pair of timestamp column 
and a bool indicates whether successed. + * If does not have time zone in string, use the default time zone. + */ +std::pair, bool> string_to_timestamp( + cudf::strings_column_view const& input, + std::string_view const& default_time_zone, + bool allow_special_expressions, + bool ansi_mode) { - return parse_string_to_timestamp(input, default_time_zone, true, allow_special_expressions, ansi_mode); + CUDF_EXPECTS(default_time_zone.size() > 0, "should specify default time zone"); + return parse_string_to_timestamp( + input, default_time_zone, true, allow_special_expressions, ansi_mode); } -std::pair, bool> -string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, - bool allow_time_zone, - bool allow_special_expressions, - bool ansi_mode) +/** + * Parse string column with time zone to timestamp column, + * Returns a pair of timestamp column and a bool indicates whether successed. + * Do not use the time zone in string. + * If allow_time_zone is false and string contains time zone, then the string is invalid. 
+ */ +std::pair, bool> string_to_timestamp_without_time_zone( + cudf::strings_column_view const& input, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode) { - return parse_string_to_timestamp(input, std::string_view(""), allow_time_zone, allow_special_expressions, ansi_mode); + return parse_string_to_timestamp(input, + std::string_view(""), // specify empty time zone + allow_time_zone, + allow_special_expressions, + ansi_mode); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 26ecf421a3..d2f1dfa39c 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -26,8 +26,7 @@ namespace spark_rapids_jni { * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 * - * Formats are: - * + * Spark supports the following formats: * `[+-]yyyy*` * `[+-]yyyy*-[m]m` * `[+-]yyyy*-[m]m-[d]d` @@ -37,6 +36,14 @@ namespace spark_rapids_jni { * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * + * Unlike Spark, Spark-Rapids only supports the following formats: + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * * Spark supports the following zone id forms: * - Z - Zulu time zone UTC+0 * - +|-[h]h:[m]m @@ -57,37 +64,72 @@ namespace spark_rapids_jni { * * @param input input string column view. * @param default_time_zone if input string does not contain a time zone, use this time zone. - * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: - * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. 
- * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings. * @param ansi_mode is ansi mode - * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise. + * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not + * empty otherwise. */ -std::pair, bool> -parse_string_to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, - bool allow_time_zone, - bool allow_special_expressions, - bool ansi_mode); -/** - * Refer to `parse_string_to_timestamp` - * If timestamp string does not contain date info(yyyy mm dd), use current date -*/ -std::pair, bool> -string_to_timestamp(cudf::strings_column_view const &input, - std::string_view const &default_time_zone, - bool allow_special_expressions, - bool ansi_mode); +std::pair, bool> string_to_timestamp( + cudf::strings_column_view const& input, + std::string_view const& default_time_zone, + bool allow_special_expressions, + bool ansi_mode); /** - * Refer to `parse_string_to_timestamp` - * - * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: + * + * Trims and parses a timestamp string column with time zone suffix to a timestamp column. 
+ * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00 + * + * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + * + * Spark supports the following formats: + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * Unlike Spark, Spark-Rapids only supports the following formats: + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * Spark supports the following zone id forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, + * and a suffix in the formats: + * - +|-h[h] + * - +|-hh[:]mm + * - +|-hh:mm:ss + * - +|-hhmmss + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Unlike Spark, Spark-Rapids only supports the following time zones: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * + * @param input input string column view. + * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. 
-*/ -std::pair, bool> -string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, - bool allow_time_zone, - bool allow_special_expressions, - bool ansi_mode); -} // namespace spark_rapids_jni + * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings. + * @param ansi_mode is ansi mode + * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not + * empty otherwise. + */ +std::pair, bool> string_to_timestamp_without_time_zone( + cudf::strings_column_view const& input, + bool allow_time_zone, + bool allow_special_expressions, + bool ansi_mode); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp index 47a733a41e..23d20bcbb6 100644 --- a/src/main/cpp/tests/datetime_parser.cpp +++ b/src/main/cpp/tests/datetime_parser.cpp @@ -31,25 +31,158 @@ #include #include -using timestamp_col = cudf::test::fixed_width_column_wrapper; - -struct DateTimeParserTest : public cudf::test::BaseFixture -{ -}; +using timestamp_col = + cudf::test::fixed_width_column_wrapper; +using micros_col = + cudf::test::fixed_width_column_wrapper; +struct DateTimeParserTest : public cudf::test::BaseFixture {}; TEST_F(DateTimeParserTest, ParseTimestamp) { - auto v = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L + 3L * 3600L + 4L * 60L + 55L) * 1000000L; - auto const ts_col = timestamp_col{v, v, v + 123456}; - - auto const ts_strings = - cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z", - "2023-11-05T03:04:55 ", - "2023-11-05T03:04:55.123456 "}; - auto const parsed_ts = - spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), - "Z", - true, - true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(parsed_ts.first)); + auto ts_strings = cudf::test::strings_column_wrapper( + { + "2023", + " 2023 ", + " 2023-11 ", + " 2023-11-5 ", + " 2023-11-05 3:04:55 ", + " 2023-11-05T03:4:55 ", + " 2023-11-05T3:4:55 
", + " 2023-11-5T3:4:55.", + " 2023-11-5T3:4:55.Iran", + " 2023-11-5T3:4:55.1 ", + " 2023-11-5T3:4:55.1Iran", + " 2023-11-05T03:04:55.123456 ", + " 2023-11-05T03:04:55.123456Iran ", + " 222222 ", + " ", // invalid + "", // invalid + "1-" // invalid + + }, + { + + 0, // null bit + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + + }); + auto d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L; + auto d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L; + auto d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L; + auto t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L; + auto d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55; + auto ts_col = timestamp_col( + { + + 0L, + d_2023_1_1, + d_2023_11_1, + d_2023_11_5, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 123456, + d_2023_11_5_t_3_4_55 + 123456, + (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, + 0L, + 0L, + 0L + + }, + { + 0, // null bit + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, // null bit + 0, // null bit + 0 // null bit + + }); + auto ret = + spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, false); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first)); + assert(ret.second == true); + + ts_strings = cudf::test::strings_column_wrapper( + { + + "invalid" + + }, + { + + 1 + + }); + ts_col = timestamp_col( + { + + 0L + + }, + {0 + + }); + ret = + spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true); + assert(ret.first == nullptr); + assert(ret.second == false); + + ts_strings = cudf::test::strings_column_wrapper( + { + + " Epoch ", " NOW ", " today ", " tomoRRow ", " yesTERday " + + }, + { + + 1, 1, 1, 1, 1 + + 
}); + ts_col = timestamp_col( + {// Temp implement: epoch -> 111, now -> 222, ... , yesterday -> 555 + 111L, + 222L, + 333L, + 444L, + 555L + + }, + {1, 1, 1, 1, 1 + + }); + ret = + spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first)); + assert(ret.second == true); } From 533f5904235a6e7fb08793886b45df7b938f0133 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 14 Dec 2023 10:12:28 +0800 Subject: [PATCH 06/35] Format code --- src/main/cpp/src/datetime_parser.cu | 2 +- src/main/cpp/tests/datetime_parser.cpp | 68 +++++++++++++------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 6411c00017..d0d9cee93f 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -464,7 +464,7 @@ void update_bitmask(cudf::column& timestamp_column, masks.push_back(static_cast(valid_bitmask.data())); offsets.push_back(0); - // merge 2 bitmasks + // merge 2 bitmasks auto [null_mask, null_count] = cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr); diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp index 23d20bcbb6..9ab4271327 100644 --- a/src/main/cpp/tests/datetime_parser.cpp +++ b/src/main/cpp/tests/datetime_parser.cpp @@ -89,43 +89,43 @@ TEST_F(DateTimeParserTest, ParseTimestamp) auto ts_col = timestamp_col( { - 0L, - d_2023_1_1, - d_2023_11_1, - d_2023_11_5, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55 + 100000, - d_2023_11_5_t_3_4_55 + 100000, - d_2023_11_5_t_3_4_55 + 123456, - d_2023_11_5_t_3_4_55 + 123456, - (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, - 0L, - 0L, - 0L + 0L, + d_2023_1_1, + d_2023_11_1, + d_2023_11_5, + d_2023_11_5_t_3_4_55, + 
d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 123456, + d_2023_11_5_t_3_4_55 + 123456, + (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, + 0L, + 0L, + 0L }, { - 0, // null bit - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, // null bit - 0, // null bit - 0 // null bit + 0, // null bit + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, // null bit + 0, // null bit + 0 // null bit }); auto ret = From 759a6dc73069a9bec70b2a17ea90818109e68f93 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 14 Dec 2023 11:02:03 +0800 Subject: [PATCH 07/35] Update for UTC time zone parser --- src/main/cpp/src/datetime_parser.cu | 34 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index d0d9cee93f..7b1be25a43 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -71,6 +71,15 @@ __device__ __host__ thrust::tuple to_utc_timestamp( return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true); } +/** + * Convert a local time in a time zone to a UTC timestamp + */ +__device__ __host__ thrust::tuple to_utc_timestamp( + timestamp_components const& components) +{ + return to_utc_timestamp(components, cudf::string_view("UTC", 3)); +} + /** * Is white space */ @@ -332,10 +341,20 @@ __device__ __host__ thrust::tuple parse_string_to_time segments[0] *= year_sign.value_or(1); // above is ported from Spark. 
+ // set components + auto components = timestamp_components{segments[0], + static_cast(segments[1]), + static_cast(segments[2]), + static_cast(segments[3]), + static_cast(segments[4]), + static_cast(segments[5]), + segments[6]}; if (default_time_zone_char_len == 0) { // invoke from `string_to_timestamp_without_time_zone` if (just_time || !allow_time_zone && tz.has_value()) { return thrust::make_tuple(error_us, false); + } else { + return to_utc_timestamp(components); } } else { // invoke from `string_to_timestamp` @@ -348,19 +367,10 @@ __device__ __host__ thrust::tuple parse_string_to_time // do not support currently return thrust::make_tuple(error_us, false); + } else { + return to_utc_timestamp(components, time_zone); } } - - // set components - auto components = timestamp_components{segments[0], - static_cast(segments[1]), - static_cast(segments[2]), - static_cast(segments[3]), - static_cast(segments[4]), - static_cast(segments[5]), - segments[6]}; - - return to_utc_timestamp(components, time_zone); } struct parse_timestamp_string_fn { @@ -441,7 +451,7 @@ std::pair, std::unique_ptr> to_times } /** - * Set the null mask of timestamp column according to the valid column. + * Set the null mask of timestamp column according to the validity column. 
*/ void update_bitmask(cudf::column& timestamp_column, cudf::column const& validity_column, From 3335201d0aaf01ed02463b0e1ec74b4c9097b2c1 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 14 Dec 2023 18:29:34 +0800 Subject: [PATCH 08/35] Add JNI interface --- src/main/cpp/src/CastStringJni.cpp | 48 ++++++++++ .../nvidia/spark/rapids/jni/CastStrings.java | 89 ++++++++++++++++++- .../spark/rapids/jni/CastStringsTest.java | 71 +++++++++++++++ 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index b7d898a0c8..32d7b7d697 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -30,6 +30,7 @@ #include #include "cudf_jni_apis.hpp" +#include "datetime_parser.hpp" #include "dtype_utils.hpp" #include "jni_utils.hpp" @@ -255,4 +256,51 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger } CATCH_CAST_EXCEPTION(env, 0); } + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, + jclass, + jlong input_column, + jstring default_time_zone, + jboolean allow_special_expressions, + jboolean ansiEnabled) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstring default_zone(env, default_time_zone); + auto input_view{*reinterpret_cast(input_column)}; + auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp( + input_view, default_zone.get(), allow_special_expressions, ansiEnabled); + if (success) { return cudf::jni::release_as_jlong(ret_cv); } + } + CATCH_STD(env, 0); + + // sucess is false, throw exception. + // Note: do not need to release ret_cv, because it's nullptr when success is false. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); +} + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone( + JNIEnv* env, + jclass, + jlong input_column, + jboolean allowTimeZone, + jboolean allow_special_expressions, + jboolean ansiEnabled) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto input_view{*reinterpret_cast(input_column)}; + auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone( + input_view, allowTimeZone, allow_special_expressions, ansiEnabled); + if (success) { return cudf::jni::release_as_jlong(ret_cv); } + } + CATCH_STD(env, 0); + + // sucess is false, throw exception. + // Note: do not need to release ret_cv, because it's nullptr when success is false. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); +} } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 2b2267f034..515f725e02 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -152,6 +152,89 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { return new ColumnVector(fromIntegersWithBase(cv.getNativeView(), base)); } + /** + * Trims and parses a timestamp string column with time zone suffix to a + * timestamp column. + * Use the default time zone if string does not contain time zone. 
+ * + * Supports the following formats: + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * Supports the following time zones: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Example: + * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] + * ts = toTimestamp(input, "UTC", allowSpecialExpressions = true, ansiEnabled = + * false) + * ts is: ['2023-01-01 00:00:00', '2023-01-01T00:00:00'] + * + * @param cv The input string column to be converted. + * @param defaultTimeZone Use the default time zone if string does not + * contain time zone. + * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow + * @param ansiEnabled is Ansi mode + * @return a timestamp column + * @throws IllegalArgumentException if cv contains invalid value when + * ansiEnabled is true + */ + public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone, + boolean allowSpecialExpressions, boolean ansiEnabled) { + if (defaultTimeZone == null || defaultTimeZone.isEmpty()) { + throw new IllegalArgumentException("Default time zone can not be empty."); + } + return new ColumnVector(toTimestamp(cv.getNativeView(), defaultTimeZone, + allowSpecialExpressions, ansiEnabled)); + } + + /** + * Trims and parses a timestamp string column with time zone suffix to a + * timestamp column. + * Do not use the time zones in timestamp strings. 
+ * + * Supports the following formats: + * `[+-]yyyy*` + * `[+-]yyyy*-[m]m` + * `[+-]yyyy*-[m]m-[d]d` + * `[+-]yyyy*-[m]m-[d]d ` + * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * + * Supports the following time zones: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Example: + * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] + * ts = toTimestampWithoutTimeZone(input, allowTimeZone = true, + * allowSpecialExpressions = true, ansiEnabled = false) + * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00'] + * + * @param cv The input string column to be converted. + * @param allow_time_zone whether allow time zone in the timestamp + * string. e.g.: + * 1991-04-14T02:00:00Asia/Shanghai is invalid + * when do not allow time zone. + * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow + * @param ansiEnabled is Ansi mode + * @return a timestamp column + * @throws IllegalArgumentException if cv contains invalid value when + * ansiEnabled is true + */ + public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, + boolean allowSpecialExpressions, boolean ansiEnabled) { + return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone, + allowSpecialExpressions, ansiEnabled)); + } + private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip, int dtype); private static native long toDecimal(long nativeColumnView, boolean ansi_enabled, boolean strip, @@ -163,4 +246,8 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); -} \ No newline at end of file + 
private static native long toTimestamp(long nativeColumnView, String defaultTimeZone, + boolean allowSpecialExpressions, boolean ansiEnabled); + private static native long toTimestampWithoutTimeZone(long nativeColumnView, + boolean allowTimeZone, boolean allowSpecialExpressions, boolean ansiEnabled); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index c39766454a..7eeee46945 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -17,6 +17,7 @@ package com.nvidia.spark.rapids.jni; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.fail; import java.util.ArrayList; @@ -324,4 +325,74 @@ void baseHex2DecTest() { convTestInternal(input, expected, 16); } } + + // TODO update after this PR is done. 
+ @Test + void toTimestampTestNonAnsi() { + long d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L; + long d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L; + long d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L; + long t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L; + long d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55; + + try ( + ColumnVector input = ColumnVector.fromStrings( + null, + " 2023 ", + " 2023-11 ", + " 2023-11-5 ", + " 2023-11-05 3:04:55 ", + " 2023-11-05T03:4:55 ", + " 2023-11-05T3:4:55 ", + " 2023-11-5T3:4:55.", + " 2023-11-5T3:4:55.Iran", + " 2023-11-5T3:4:55.1 ", + " 2023-11-5T3:4:55.1Iran", + " 2023-11-05T03:04:55.123456 ", + " 2023-11-05T03:04:55.123456Iran ", + " 222222 ", + " ", // invalid + "", // invalid + "1-" // invalid + ); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs( + null, + d_2023_1_1, + d_2023_11_1, + d_2023_11_5, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 100000, + d_2023_11_5_t_3_4_55 + 123456, + d_2023_11_5_t_3_4_55 + 123456, + (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, + null, + null, + null); + ColumnVector actual = CastStrings.toTimestamp(input, + "Asia/Shanghai", false, false)) { + AssertUtils.assertColumnsAreEqual(expected, actual); + } + } + + @Test + void toTimestampTestAnsi() { + assertThrows(IllegalArgumentException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) { + // ansiEnabled is true + CastStrings.toTimestamp(input, "Asia/Shanghai", false, true); + } + }); + + assertThrows(IllegalArgumentException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) { + // ansiEnabled is true + CastStrings.toTimestampWithoutTimeZone(input, true, false, true); + } + 
}); + } } From 2270d8b10ed376f7a6af93d2fcf8c269d6f9a298 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 27 Dec 2023 18:32:18 +0800 Subject: [PATCH 09/35] complete the work Signed-off-by: sperlingxx --- src/main/cpp/src/CastStringJni.cpp | 44 +- src/main/cpp/src/datetime_parser.cu | 839 ++++++++++-------- src/main/cpp/src/datetime_parser.hpp | 14 +- .../nvidia/spark/rapids/jni/CastStrings.java | 52 +- .../spark/rapids/jni/GpuTimeZoneDB.java | 94 +- .../spark/rapids/jni/CastStringsTest.java | 182 ++-- 6 files changed, 728 insertions(+), 497 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 32d7b7d697..60bf69ff2e 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -257,21 +257,30 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger CATCH_CAST_EXCEPTION(env, 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, - jclass, - jlong input_column, - jstring default_time_zone, - jboolean allow_special_expressions, - jboolean ansiEnabled) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv *env, + jclass, + jlong input_column, + jlong transitions_handle, + jlong tz_indices_col, + jlong special_dt_lit_col, + jint tz_default_index, + jboolean ansi_enabled) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::jni::native_jstring default_zone(env, default_time_zone); - auto input_view{*reinterpret_cast(input_column)}; - auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp( - input_view, default_zone.get(), allow_special_expressions, ansiEnabled); + + auto const &input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); + auto const transitions = reinterpret_cast(transitions_handle)->column(0); + auto const &tz_indices_view = cudf::strings_column_view( + 
*reinterpret_cast(tz_indices_col)); + auto const &special_dt_lit_view = cudf::strings_column_view( + *reinterpret_cast(special_dt_lit_col)); + + auto const tz_index = static_cast(tz_default_index); + + auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_with_tz( + input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled); if (success) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); @@ -285,16 +294,19 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp JNIEnv* env, jclass, jlong input_column, - jboolean allowTimeZone, - jboolean allow_special_expressions, - jboolean ansiEnabled) + jlong special_dt_lit_col, + jboolean allow_time_zone, + jboolean ansi_enabled) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - auto input_view{*reinterpret_cast(input_column)}; + auto const &input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); + auto const &special_dt_lit_view = cudf::strings_column_view( + *reinterpret_cast(special_dt_lit_col)); + auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone( - input_view, allowTimeZone, allow_special_expressions, ansiEnabled); + input_view, special_dt_lit_view, allow_time_zone, ansi_enabled); if (success) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 7b1be25a43..31b5fe0099 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#include -#include -#include +#include "datetime_parser.hpp" #include +#include + +#include #include #include @@ -26,21 +27,36 @@ #include #include #include + +#include +#include #include -#include #include -#include + +#include #include #include +#include #include + #include #include + +#include #include +#include #include #include #include -#include "datetime_parser.hpp" +using column = cudf::column; +using column_device_view = cudf::column_device_view; +using column_view = cudf::column_view; +using lists_column_device_view = cudf::detail::lists_column_device_view; +using size_type = cudf::size_type; +using string_view = cudf::string_view; +using struct_view = cudf::struct_view; +using table_view = cudf::table_view; namespace { @@ -57,29 +73,6 @@ struct timestamp_components { int32_t microseconds; }; -/** - * Convert a local time in a time zone to a UTC timestamp - */ -__device__ __host__ thrust::tuple to_utc_timestamp( - timestamp_components const& components, cudf::string_view const& time_zone) -{ - // TODO replace the following fake implementation - long seconds = components.year * 365L * 86400L + components.month * 30L * 86400L + - components.day * 86400L + components.hour * 3600L + components.minute * 60L + - components.second; - long us = seconds * 1000000L + components.microseconds; - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true); -} - -/** - * Convert a local time in a time zone to a UTC timestamp - */ -__device__ __host__ thrust::tuple to_utc_timestamp( - timestamp_components const& components) -{ - return to_utc_timestamp(components, cudf::string_view("UTC", 3)); -} - /** * Is white space */ @@ -100,11 +93,10 @@ __device__ __host__ inline bool is_whitespace(const char chr) * "epoch", "now", "today", "yesterday", "tomorrow" * the expect string should be lower-case a-z chars */ -__device__ __host__ inline bool equals_ascii_ignore_case(char const* actual_begin, - char const* actual_end, - char const* expect_begin, - char 
const* expect_end) -{ +__device__ inline bool equals_ascii_ignore_case(char const *actual_begin, + char const *actual_end, + char const *expect_begin, + char const *expect_end) { if (actual_end - actual_begin != expect_end - expect_begin) { return false; } while (expect_begin < expect_end) { @@ -125,281 +117,380 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) const int constexpr maxDigitsYear = 6; // For the nanosecond part, more than 6 digits is allowed, but will be truncated. return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID - (segment == 7 && digits <= 2) || - (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); + // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); } -/** - * Ported from Spark: - * https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ - * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 - * - * Parse a string with time zone to a timestamp. - * The bool in the returned tuple is false if the parse failed. 
- */ -__device__ __host__ thrust::tuple parse_string_to_timestamp_us( - cudf::string_view const& timestamp_str, - const char* default_time_zone, - cudf::size_type default_time_zone_char_len, - bool allow_time_zone, - bool allow_special_expressions, - cudf::timestamp_us epoch, - cudf::timestamp_us now, - cudf::timestamp_us today, - cudf::timestamp_us tomorrow, - cudf::timestamp_us yesterday) -{ - auto error_us = cudf::timestamp_us{cudf::duration_us{0}}; +enum ParseResult { + OK = 0, + INVALID = 1, + UNSUPPORTED = 2 +}; - if (timestamp_str.empty()) { return thrust::make_tuple(error_us, false); } +template +struct parse_timestamp_string_fn { + column_device_view const d_strings; + column_device_view const special_datetime_names; + size_type default_tz_index; + bool allow_tz_in_date_str = true; + // The list column of transitions to figure out the correct offset + // to adjust the timestamp. The type of the values in this column is + // LIST>. + thrust::optional transitions = thrust::nullopt; + thrust::optional tz_indices = thrust::nullopt; + + __device__ thrust::tuple operator()(const cudf::size_type& idx) const + { + if (!d_strings.is_valid(idx)) { + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + } - const char* curr_ptr = timestamp_str.data(); - const char* end_ptr = curr_ptr + timestamp_str.size_bytes(); + auto const d_str = d_strings.element(idx); - // trim left - while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { - ++curr_ptr; - } - // trim right - while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) { - --end_ptr; - } + timestamp_components ts_comp{}; + char const * tz_lit_ptr = nullptr; + size_type tz_lit_len = 0; + switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) { + case ParseResult::INVALID: + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + case ParseResult::UNSUPPORTED: + return 
thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); + case ParseResult::OK: + default: + break; + } - // special strings: epoch, now, today, yesterday, tomorrow - if (allow_special_expressions) { - char const* begin_epoch = "epoch"; - char const* end_epoch = begin_epoch + 5; - - char const* begin_now = "now"; - char const* end_now = begin_now + 3; - - char const* begin_today = "today"; - char const* end_today = begin_today + 5; - - char const* begin_tomorrow = "tomorrow"; - char const* end_tomorrow = begin_tomorrow + 8; - - char const* begin_yesterday = "yesterday"; - char const* end_yesterday = begin_yesterday + 9; - - if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_epoch, end_epoch)) { - // epoch - return thrust::make_tuple(epoch, true); - } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_now, end_now)) { - // now - return thrust::make_tuple(now, true); - } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_today, end_today)) { - // today - return thrust::make_tuple(today, true); - } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) { - // tomorrow - return thrust::make_tuple(tomorrow, true); - } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) { - // yesterday - return thrust::make_tuple(yesterday, true); + if constexpr (!with_timezone) { + // path without timezone, in which unix_timestamp is straightforwardly computed + auto const ts_unaligned = compute_epoch_us(ts_comp); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK); + } + + // path with timezone, in which timezone offset has to be determined before computing unix_timestamp + int64_t tz_offset; + if (tz_lit_ptr == nullptr) { + tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index); + } else { + auto tz_view = string_view(tz_lit_ptr, tz_lit_len); + if (auto [utc_offset, ret_code] = 
parse_utc_like_tz(tz_view); ret_code == 0) { + tz_offset = utc_offset; + } else if (ret_code == 1) { + auto tz_index = query_index_from_tz_db(tz_view); + if (tz_index > transitions->size()) { + if (tz_index == tz_indices->size()) + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); + } + tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); + } else { + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + } } + + auto const ts_unaligned = compute_epoch_us(ts_comp); + + return thrust::make_tuple( + cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, + ParseResult::OK); } - if (curr_ptr == end_ptr) { return thrust::make_tuple(error_us, false); } - - const char* const bytes = curr_ptr; - const size_t bytes_length = end_ptr - curr_ptr; - - thrust::optional tz; - int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; - int segments_len = 9; - int i = 0; - int current_segment_value = 0; - int current_segment_digits = 0; - size_t j = 0; - int digits_milli = 0; - bool just_time = false; - thrust::optional year_sign; - if ('-' == bytes[j] || '+' == bytes[j]) { - if ('-' == bytes[j]) { - year_sign = -1; + // TODO: support CST/PST/AST + __device__ inline thrust::pair parse_utc_like_tz(string_view const &tz_lit) const + { + size_type len = tz_lit.size_bytes(); + + char const *ptr = tz_lit.data(); + + if (*ptr == 'Z') { + if (len > 1) return {0, 1}; + return {0, 0}; + } + + size_t char_offset = 0; + + if (len > 2 + && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') + || (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) { + char_offset = 3; + } + + if (len == char_offset) return {0, 0}; + + char const sign_char = *(ptr + char_offset++); + int64_t sign; + if (sign_char == '+') { + sign = 1L; + } else if (sign_char == '-') { + sign = -1L; } else { - 
year_sign = 1; + return {0, char_offset < 3 ? 1 : 2}; + } + + int64_t hms[3] = {0L, 0L, 0L}; + bool has_colon = false; + bool one_digit_mm = false; + for (size_type i = 0; i < 3; i++) { + if (i == 2 && one_digit_mm) return {0, 2}; + + hms[i] = *(ptr + char_offset++) - '0'; + if (hms[i] < 0 || hms[i] > 9) return {0, 2}; + + if (len == char_offset) { + if (i > 0) { + if (!has_colon) return {0, 2}; + one_digit_mm = true; + } + break; + } + + if (*(ptr + char_offset) == ':') { + if (len == ++char_offset) break; + has_colon = true; + continue; + } + + auto digit = *(ptr + char_offset++) - '0'; + if (digit < 0 || digit > 9) return {0, 2}; + hms[i] = hms[i] * 10 + digit; + + if (len == char_offset) break; + if (*(ptr + char_offset) == ':') { + if (len == ++char_offset) break; + has_colon = true; + continue; + } + if (has_colon) return {0, 2}; } - j += 1; + + if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2}; + if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2}; + + return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0}; } - while (j < bytes_length) { - char b = bytes[j]; - int parsed_value = static_cast(b - '0'); - if (parsed_value < 0 || parsed_value > 9) { - if (0 == j && 'T' == b) { - just_time = true; - i += 3; - } else if (i < 2) { - if (b == '-') { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); - } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; - } else if (0 == i && ':' == b && !year_sign.has_value()) { - just_time = true; - if (!is_valid_digits(3, current_segment_digits)) { - return thrust::make_tuple(error_us, false); + __device__ inline int query_index_from_tz_db(string_view const &tz_lit) const + { + // TODO: replace with more efficient approach (such as binary search or prefix tree) + auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) { + return tz->element(i) == tz_lit; + }; + auto ret = 
thrust::find_if(thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tz_indices->size()), + predicate); + + return *ret; + } + + __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, size_type tz_index) const + { + auto const &utc_offsets = transitions->child().child(2); + auto const &loose_instants = transitions->child().child(3); + + auto const local_transitions = cudf::list_device_view{*transitions, tz_index}; + auto const list_size = local_transitions.size(); + + auto const transition_times = cudf::device_span( + loose_instants.data() + local_transitions.element_offset(0), + static_cast(list_size)); + + auto const it = thrust::upper_bound( + thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); + auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); + auto const list_offset = local_transitions.element_offset(idx - 1); + + return static_cast(utc_offsets.element(list_offset)); + } + + __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const + { + return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + ts.minute * 60L + ts.second; + } + + __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const + { + auto const ymd = // chrono class handles the leap year calculations for us + cuda::std::chrono::year_month_day( + cuda::std::chrono::year{ts.year}, + cuda::std::chrono::month{static_cast(ts.month)}, + cuda::std::chrono::day{static_cast(ts.day)}); + auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); + + int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + + return timestamp_s * 1000000L + ts.microseconds; + } + + /** + * Ported from Spark: + * https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ + * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 + * + * Parse a string with time zone 
to a timestamp. + * The bool in the returned tuple is false if the parse failed. + */ + __device__ inline ParseResult parse_string_to_timestamp_us( + timestamp_components *ts_comp, + char const **parsed_tz_ptr, + size_type *parsed_tz_length, + cudf::string_view const ×tamp_str) const { + + if (timestamp_str.empty()) { return ParseResult::INVALID; } + + const char *curr_ptr = timestamp_str.data(); + const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); + + // trim left + while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { + ++curr_ptr; + } + // trim right + while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) { + --end_ptr; + } + + // TODO: support special dates [epoch, now, today, yesterday, tomorrow] + for (size_type i = 0; i < special_datetime_names.size(); i++) { + auto const& ref = special_datetime_names.element(i); + if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) { + *parsed_tz_ptr = ref.data(); + *parsed_tz_length = ref.size_bytes(); + return ParseResult::UNSUPPORTED; + } + } + + if (curr_ptr == end_ptr) { return ParseResult::INVALID; } + + const char *const bytes = curr_ptr; + const size_type bytes_length = end_ptr - curr_ptr; + + int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; + int segments_len = 9; + int i = 0; + int current_segment_value = 0; + int current_segment_digits = 0; + size_t j = 0; + int digits_milli = 0; + // bool just_time = false; + thrust::optional year_sign; + if ('-' == bytes[j] || '+' == bytes[j]) { + if ('-' == bytes[j]) { + year_sign = -1; + } else { + year_sign = 1; + } + j += 1; + } + + while (j < bytes_length) { + char b = bytes[j]; + int parsed_value = static_cast(b - '0'); + if (parsed_value < 0 || parsed_value > 9) { + if (0 == j && 'T' == b) { + // just_time = true; + i += 3; + } else if (i < 2) { + if (b == '-') { + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; 
+ current_segment_digits = 0; + i += 1; + } else if (0 == i && ':' == b && !year_sign.has_value()) { + // just_time = true; + if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; } + segments[3] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i = 4; + } else { + return ParseResult::INVALID; } - segments[3] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i = 4; - } else { - return thrust::make_tuple(error_us, false); - } - } else if (2 == i) { - if (' ' == b || 'T' == b) { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); + } else if (2 == i) { + if (' ' == b || 'T' == b) { + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return ParseResult::INVALID; } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; - } else { - return thrust::make_tuple(error_us, false); - } - } else if (3 == i || 4 == i) { - if (':' == b) { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); + } else if (3 == i || 4 == i) { + if (':' == b) { + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return ParseResult::INVALID; } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; - } else { - return thrust::make_tuple(error_us, false); - } - } else if (5 == i || 6 == i) { - if ('.' == b && 5 == i) { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); + } else if (5 == i || 6 == i) { + if ('.' 
== b && 5 == i) { + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + *parsed_tz_ptr = bytes + j; + // strip the whitespace between timestamp and timezone + while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) ++(*parsed_tz_ptr); + *parsed_tz_length = end_ptr - *parsed_tz_ptr; + break; } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; + if (i == 6 && '.' != b) { i += 1; } } else { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); + if (i < segments_len && (':' == b || ' ' == b)) { + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; + current_segment_digits = 0; + i += 1; + } else { + return ParseResult::INVALID; } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; - tz = cudf::string_view(bytes + j, (bytes_length - j)); - j = bytes_length - 1; } - if (i == 6 && '.' 
!= b) { i += 1; } } else { - if (i < segments_len && (':' == b || ' ' == b)) { - if (!is_valid_digits(i, current_segment_digits)) { - return thrust::make_tuple(error_us, false); - } - segments[i] = current_segment_value; - current_segment_value = 0; - current_segment_digits = 0; - i += 1; - } else { - return thrust::make_tuple(error_us, false); + if (6 == i) { digits_milli += 1; } + // We will truncate the nanosecond part if there are more than 6 digits, which results + // in loss of precision + if (6 != i || current_segment_digits < 6) { + current_segment_value = current_segment_value * 10 + parsed_value; } + current_segment_digits += 1; } - } else { - if (6 == i) { digits_milli += 1; } - // We will truncate the nanosecond part if there are more than 6 digits, which results - // in loss of precision - if (6 != i || current_segment_digits < 6) { - current_segment_value = current_segment_value * 10 + parsed_value; - } - current_segment_digits += 1; + j += 1; } - j += 1; - } - if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); } - segments[i] = current_segment_value; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; - while (digits_milli < 6) { - segments[6] *= 10; - digits_milli += 1; - } + while (digits_milli < 6) { + segments[6] *= 10; + digits_milli += 1; + } - cudf::string_view time_zone; - if (tz.has_value()) { - time_zone = tz.value(); - } else { - time_zone = cudf::string_view(default_time_zone, default_time_zone_char_len); - } + segments[0] *= year_sign.value_or(1); + // above is ported from Spark. - segments[0] *= year_sign.value_or(1); - // above is ported from Spark. 
- - // set components - auto components = timestamp_components{segments[0], - static_cast(segments[1]), - static_cast(segments[2]), - static_cast(segments[3]), - static_cast(segments[4]), - static_cast(segments[5]), - segments[6]}; - if (default_time_zone_char_len == 0) { - // invoke from `string_to_timestamp_without_time_zone` - if (just_time || !allow_time_zone && tz.has_value()) { - return thrust::make_tuple(error_us, false); - } else { - return to_utc_timestamp(components); - } - } else { - // invoke from `string_to_timestamp` - if (just_time) { - // Update here to support the following format: - // `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` - // `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` - // by set local date in a time zone: year-month-day. - // Above 2 formats are time zone related, Spark uses LocalDate.now(zoneId) - - // do not support currently - return thrust::make_tuple(error_us, false); - } else { - return to_utc_timestamp(components, time_zone); - } - } -} + // set components + ts_comp->year = segments[0]; + ts_comp->month = static_cast(segments[1]); + ts_comp->day = static_cast(segments[2]); + ts_comp->hour = static_cast(segments[3]); + ts_comp->minute = static_cast(segments[4]); + ts_comp->second = static_cast(segments[5]); + ts_comp->microseconds = segments[6]; -struct parse_timestamp_string_fn { - cudf::column_device_view const d_strings; - const char* default_time_zone; - cudf::size_type default_time_zone_char_len; - bool allow_time_zone; - bool allow_special_expressions; - // TODO the following should be passed in. - // Note: today, tomorrow, yesterday are time zone related, should use time zone to generate. 
- cudf::timestamp_us epoch = cudf::timestamp_us{cudf::duration_us{111L}}; - cudf::timestamp_us now = cudf::timestamp_us{cudf::duration_us{222L}}; - cudf::timestamp_us today = cudf::timestamp_us{cudf::duration_us{333L}}; - cudf::timestamp_us tomorrow = cudf::timestamp_us{cudf::duration_us{444L}}; - cudf::timestamp_us yesterday = cudf::timestamp_us{cudf::duration_us{555L}}; - - __device__ thrust::tuple operator()(const cudf::size_type& idx) const - { - auto const d_str = d_strings.element(idx); - return parse_string_to_timestamp_us(d_str, - default_time_zone, - default_time_zone_char_len, - allow_time_zone, - allow_special_expressions, - epoch, - now, - today, - tomorrow, - yesterday); + return ParseResult::OK; } }; @@ -408,114 +499,90 @@ struct parse_timestamp_string_fn { * Trims and parses timestamp string column to a timestamp column and a is valid column * */ -std::pair, std::unique_ptr> to_timestamp( +std::pair, bool> to_timestamp( cudf::strings_column_view const& input, - std::string_view const& default_time_zone, - bool allow_time_zone, - bool allow_special_expressions, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + cudf::strings_column_view const& special_datetime_lit, + bool ansi_mode, + bool allow_tz_in_date_str = true, + size_type default_tz_index = 1000000000, + cudf::column_view const *transitions = nullptr, + cudf::strings_column_view const *tz_indices = nullptr) { + auto const stream = cudf::get_default_stream(); + auto const mr = rmm::mr::get_current_device_resource(); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + auto d_special_datetime_lit = cudf::column_device_view::create(special_datetime_lit.parent(), stream); - auto output_timestamp = - cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, - input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - stream, - mr); - // record which string is failed to parse. 
- auto output_bool = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + auto result_col = + cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), + cudf::mask_state::UNALLOCATED, stream, mr); + // record which string is failed to parse. + auto result_valid_col = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::UINT8}, + input.size(), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + if (transitions == nullptr || tz_indices == nullptr) { + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, + *d_special_datetime_lit, + default_tz_index, + allow_tz_in_date_str}); + } else { + auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); + auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; + auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, + *d_special_datetime_lit, + default_tz_index, + true, + d_transitions, + *d_tz_indices}); + } - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator( - thrust::make_tuple(output_timestamp->mutable_view().begin(), - output_bool->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, - default_time_zone.data(), - 
static_cast(default_time_zone.size()), - allow_time_zone, - allow_special_expressions}); - - return std::make_pair(std::move(output_timestamp), std::move(output_bool)); -} + auto valid_view = result_valid_col->mutable_view(); -/** - * Set the null mask of timestamp column according to the validity column. - */ -void update_bitmask(cudf::column& timestamp_column, - cudf::column const& validity_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const& ts_view = timestamp_column.view(); - auto const& valid_view = validity_column.view(); - std::vector masks; - std::vector offsets; - if (timestamp_column.nullable()) { - masks.push_back(ts_view.null_mask()); - offsets.push_back(ts_view.offset()); + auto exception_exists = thrust::any_of( + rmm::exec_policy(stream), + valid_view.begin(), + valid_view.end(), + []__device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; }); + if (exception_exists) { + CUDF_FAIL("There exists unsupported timestamp schema!"); } - // generate bitmask from `validity_column` auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if( - valid_view.begin(), valid_view.end(), thrust::identity{}, stream, mr); - - masks.push_back(static_cast(valid_bitmask.data())); - offsets.push_back(0); - - // merge 2 bitmasks - auto [null_mask, null_count] = - cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr); - - timestamp_column.set_null_mask(null_mask, null_count); -} + valid_view.begin(), valid_view.end(), + [] __device__(uint8_t e) { return e == 0; }, + stream, mr); -/** - * Parse string column with time zone to timestamp column, - * Returns a pair of timestamp column and a bool indicates whether successed. 
- */ -std::pair, bool> parse_string_to_timestamp( - cudf::strings_column_view const& input, - std::string_view const& default_time_zone, - bool allow_time_zone, - bool allow_special_expressions, - bool ansi_mode) -{ - auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}; - if (input.size() == 0) { - return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true); + if (ansi_mode && input.null_count() < valid_null_count) { + // has invalid value in validity column under ansi mode + return std::make_pair(nullptr, false); } - auto const stream = cudf::get_default_stream(); - auto const mr = rmm::mr::get_current_device_resource(); - auto [timestamp_column, validity_column] = - to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, stream, mr); - - if (ansi_mode) { - // create scalar, value is false, is_valid is true - cudf::numeric_scalar false_scalar{false, true, stream, mr}; - if (cudf::contains(*validity_column, false_scalar, stream)) { - // has invalid value in validity column under ansi mode - return std::make_pair(nullptr, false); - } else { - update_bitmask(*timestamp_column, *validity_column, stream, mr); - return std::make_pair(std::move(timestamp_column), true); - } - } else { - update_bitmask(*timestamp_column, *validity_column, stream, mr); - return std::make_pair(std::move(timestamp_column), true); - } + result_col->set_null_mask(valid_bitmask, valid_null_count, stream); + return std::make_pair(std::move(result_col), true); } } // namespace @@ -527,15 +594,18 @@ namespace spark_rapids_jni { * Returns a pair of timestamp column and a bool indicates whether successed. * If does not have time zone in string, use the default time zone. 
*/ -std::pair, bool> string_to_timestamp( +std::pair, bool> string_to_timestamp_with_tz( cudf::strings_column_view const& input, - std::string_view const& default_time_zone, - bool allow_special_expressions, + cudf::column_view const& transitions, + cudf::strings_column_view const& tz_indices, + cudf::strings_column_view const& special_datetime_lit, + cudf::size_type default_tz_index, bool ansi_mode) { - CUDF_EXPECTS(default_time_zone.size() > 0, "should specify default time zone"); - return parse_string_to_timestamp( - input, default_time_zone, true, allow_special_expressions, ansi_mode); + if (input.size() == 0) { + return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true); + } + return to_timestamp(input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices); } /** @@ -546,15 +616,14 @@ std::pair, bool> string_to_timestamp( */ std::pair, bool> string_to_timestamp_without_time_zone( cudf::strings_column_view const& input, + cudf::strings_column_view const& special_datetime_lit, bool allow_time_zone, - bool allow_special_expressions, bool ansi_mode) { - return parse_string_to_timestamp(input, - std::string_view(""), // specify empty time zone - allow_time_zone, - allow_special_expressions, - ansi_mode); + if (input.size() == 0) { + return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true); + } + return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index d2f1dfa39c..1b72a1fbb8 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -68,11 +68,13 @@ namespace spark_rapids_jni { * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not * empty otherwise. 
*/ -std::pair, bool> string_to_timestamp( - cudf::strings_column_view const& input, - std::string_view const& default_time_zone, - bool allow_special_expressions, - bool ansi_mode); +std::pair, bool> string_to_timestamp_with_tz( + cudf::strings_column_view const& input, + cudf::column_view const& transitions, + cudf::strings_column_view const& tz_indices, + cudf::strings_column_view const& special_datetime_lit, + cudf::size_type default_tz_index, + bool ansi_mode); /** * @@ -128,8 +130,8 @@ std::pair, bool> string_to_timestamp( */ std::pair, bool> string_to_timestamp_without_time_zone( cudf::strings_column_view const& input, + cudf::strings_column_view const& special_datetime_lit, bool allow_time_zone, - bool allow_special_expressions, bool ansi_mode); } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 515f725e02..b383468e7e 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -18,6 +18,8 @@ import ai.rapids.cudf.*; +import java.time.ZoneId; + /** Utility class for casting between string columns and native type columns */ public class CastStrings { static { @@ -179,19 +181,30 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * @param cv The input string column to be converted. * @param defaultTimeZone Use the default time zone if string does not * contain time zone. 
- * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow * @param ansiEnabled is Ansi mode * @return a timestamp column * @throws IllegalArgumentException if cv contains invalid value when * ansiEnabled is true */ - public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone, - boolean allowSpecialExpressions, boolean ansiEnabled) { - if (defaultTimeZone == null || defaultTimeZone.isEmpty()) { - throw new IllegalArgumentException("Default time zone can not be empty."); + public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) { + if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) { + throw new IllegalArgumentException(String.format("Unsupported timezone: %s", + defaultTimeZone.toString())); + } + + GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance(); + if (!singleton.isLoaded()) { + GpuTimeZoneDB.cacheDatabase(); + } + + Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString()); + + try (Table transitions = singleton.getTransitions(); + ColumnVector tzIndices = singleton.getZoneIDVector(); + ColumnVector specialTz = singleton.getSpecialTzVector()) { + return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(), + tzIndices.getNativeView(), specialTz.getNativeView(), tzIndex, ansiEnabled)); } - return new ColumnVector(toTimestamp(cv.getNativeView(), defaultTimeZone, - allowSpecialExpressions, ansiEnabled)); } /** @@ -219,20 +232,25 @@ public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone, * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00'] * * @param cv The input string column to be converted. - * @param allow_time_zone whether allow time zone in the timestamp + * @param allowTimeZone whether allow time zone in the timestamp * string. e.g.: * 1991-04-14T02:00:00Asia/Shanghai is invalid * when do not allow time zone. 
- * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow * @param ansiEnabled is Ansi mode * @return a timestamp column * @throws IllegalArgumentException if cv contains invalid value when * ansiEnabled is true */ - public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, - boolean allowSpecialExpressions, boolean ansiEnabled) { - return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone, - allowSpecialExpressions, ansiEnabled)); + public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) { + GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance(); + if (!singleton.isLoaded()) { + GpuTimeZoneDB.cacheDatabase(); + } + + try (ColumnVector specialTz = singleton.getSpecialTzVector()) { + return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), specialTz.getNativeView(), + allowTimeZone, ansiEnabled)); + } } private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip, @@ -246,8 +264,8 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); - private static native long toTimestamp(long nativeColumnView, String defaultTimeZone, - boolean allowSpecialExpressions, boolean ansiEnabled); - private static native long toTimestampWithoutTimeZone(long nativeColumnView, - boolean allowTimeZone, boolean allowSpecialExpressions, boolean ansiEnabled); + private static native long toTimestamp(long input, + long transitions, long tzIndices, long specialDate, int tzIndex, boolean ansiEnabled); + private static native long toTimestampWithoutTimeZone(long input, + long specialDate, boolean allowTimeZone, boolean ansiEnabled); } diff --git 
a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index b63a9dc282..f8b49b5b22 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -17,6 +17,7 @@ package com.nvidia.spark.rapids.jni; import java.time.Instant; +import java.time.LocalDateTime; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; import java.time.zone.ZoneRules; @@ -27,15 +28,14 @@ import java.util.Map; import java.util.TimeZone; import java.util.concurrent.*; +import java.util.function.Function; -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.HostColumnVector; -import ai.rapids.cudf.Table; +import ai.rapids.cudf.*; public class GpuTimeZoneDB { public static final int TIMEOUT_SECS = 300; + public static final String[] SPECIAL_TZ_LITERALS = {"epoch", "now", "today", "tomorrow", "yesterday"}; // For the timezone database, we store the transitions in a ColumnVector that is a list of @@ -43,14 +43,18 @@ public class GpuTimeZoneDB { // LIST> private CompletableFuture> zoneIdToTableFuture; private CompletableFuture fixedTransitionsFuture; + private CompletableFuture zoneIdVectorFuture; + private CompletableFuture specialTzLiteralsFuture; private boolean closed = false; GpuTimeZoneDB() { zoneIdToTableFuture = new CompletableFuture<>(); fixedTransitionsFuture = new CompletableFuture<>(); + zoneIdVectorFuture = new CompletableFuture<>(); + specialTzLiteralsFuture = new CompletableFuture<>(); } - + private static GpuTimeZoneDB instance = new GpuTimeZoneDB(); // This method is default visibility for testing purposes only. The instance will be never be exposed publicly // for this class. 
@@ -157,10 +161,11 @@ public static ZoneId getZoneId(String timeZoneId) { return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS); } - private boolean isLoaded() { - return zoneIdToTableFuture.isDone(); + public boolean isLoaded() { + return zoneIdToTableFuture.isDone() && fixedTransitionsFuture.isDone() && + zoneIdVectorFuture.isDone() && specialTzLiteralsFuture.isDone(); } - + private void loadData(Executor executor) throws IllegalStateException { // Start loading the data in separate thread and return try { @@ -176,6 +181,9 @@ private void doLoadData() { try { Map zoneIdToTable = new HashMap<>(); List> masterTransitions = new ArrayList<>(); + List zondIdList = new ArrayList<>(); + List unsupportedZoneList = new ArrayList<>(); + for (String tzId : TimeZone.getAvailableIDs()) { ZoneId zoneId; try { @@ -189,6 +197,7 @@ private void doLoadData() { ZoneRules zoneRules = zoneId.getRules(); // Filter by non-repeating rules if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { + unsupportedZoneList.add(zoneId.getId()); continue; } if (!zoneIdToTable.containsKey(zoneId.getId())) { @@ -198,16 +207,27 @@ private void doLoadData() { if (zoneRules.isFixedOffset()) { data.add( new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - zoneRules.getOffset(Instant.now()).getTotalSeconds()) + zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE) ); } else { // Capture the first official offset (before any transition) using Long min ZoneOffsetTransition first = transitions.get(0); data.add( new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - first.getOffsetBefore().getTotalSeconds()) + first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE) ); transitions.forEach(t -> { + // A simple approach to transform LocalDateTime to a value which is proportional to + // the exact EpochSecond. 
After caching these values along with EpochSeconds, we + // can easily search out which time zone transition rule we should apply according + // to LocalDateTime structs. The searching procedure is same as the binary search with + // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose EpochSeconds" + // as search index instead of exact EpochSeconds. + Function localToLooseEpochSecond = lt -> + 86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L + + lt.getDayOfMonth() - 1) + + 3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond(); + // Whether transition is an overlap vs gap. // In Spark: // if it's a gap, then we use the offset after *on* the instant @@ -219,35 +239,53 @@ private void doLoadData() { new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeAfter()) + ) ); } else { data.add( new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds()) + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeBefore()) + ) ); } }); } masterTransitions.add(data); zoneIdToTable.put(zoneId.getId(), idx); + zondIdList.add(zoneId.getId()); } } + zoneIdToTableFuture.complete(zoneIdToTable); + HostColumnVector.DataType childType = new HostColumnVector.StructType(false, new HostColumnVector.BasicType(false, DType.INT64), new HostColumnVector.BasicType(false, DType.INT64), - new HostColumnVector.BasicType(false, DType.INT32)); + new HostColumnVector.BasicType(false, DType.INT32), + new HostColumnVector.BasicType(false, DType.INT64)); HostColumnVector.DataType resultType = new HostColumnVector.ListType(false, childType); - HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, - 
masterTransitions.toArray(new List[0])); - fixedTransitionsFuture.complete(fixedTransitions); - zoneIdToTableFuture.complete(zoneIdToTable); + + zondIdList.addAll(unsupportedZoneList); + + try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) { + try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) { + try (HostColumnVector specialTzVector = HostColumnVector.fromStrings(SPECIAL_TZ_LITERALS)) { + fixedTransitionsFuture.complete(fixedTransitions.incRefCount()); + zoneIdVectorFuture.complete(zoneIdVector.incRefCount()); + specialTzLiteralsFuture.complete(specialTzVector.incRefCount()); + } + } + } } catch (Exception e) { fixedTransitionsFuture.completeExceptionally(e); zoneIdToTableFuture.completeExceptionally(e); + zoneIdVectorFuture.completeExceptionally(e); + specialTzLiteralsFuture.completeExceptionally(e); throw e; } } @@ -273,7 +311,7 @@ private HostColumnVector getHostFixedTransitions() { } } - private Map getZoneIDMap() { + public Map getZoneIDMap() { try { return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); } catch (InterruptedException | ExecutionException | TimeoutException e) { @@ -281,7 +319,25 @@ private Map getZoneIDMap() { } } - private Table getTransitions() { + public ColumnVector getZoneIDVector() { + try { + HostColumnVector hcv = zoneIdVectorFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); + return hcv.copyToDevice(); + } catch (InterruptedException | ExecutionException | TimeoutException e) { + throw new RuntimeException(e); + } + } + + public ColumnVector getSpecialTzVector() { + try { + HostColumnVector hcv = specialTzLiteralsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); + return hcv.copyToDevice(); + } catch (InterruptedException | ExecutionException | TimeoutException e) { + throw new RuntimeException(e); + } + } + + public Table getTransitions() { try (ColumnVector fixedTransitions = getFixedTransitions()) { return new 
Table(fixedTransitions); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 7eeee46945..a8939bc825 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -20,8 +20,11 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.fail; +import java.time.*; import java.util.ArrayList; import java.util.List; +import java.util.AbstractMap; +import java.util.Map; import org.junit.jupiter.api.Test; @@ -326,72 +329,143 @@ void baseHex2DecTest() { } } - // TODO update after this PR is done. @Test - void toTimestampTestNonAnsi() { - long d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L; - long d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L; - long d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L; - long t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L; - long d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55; + void toTimestampTestAnsiWithoutTz() { + assertThrows(IllegalArgumentException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) { + // ansiEnabled is true + CastStrings.toTimestampWithoutTimeZone(input, false, true); + } + }); + + Instant instant = LocalDateTime.parse("2023-11-05T03:04:55").toInstant(ZoneOffset.UTC); + long expectedResults = instant.getEpochSecond() * 1000000L; try ( - ColumnVector input = ColumnVector.fromStrings( - null, - " 2023 ", - " 2023-11 ", - " 2023-11-5 ", - " 2023-11-05 3:04:55 ", - " 2023-11-05T03:4:55 ", - " 2023-11-05T3:4:55 ", - " 2023-11-5T3:4:55.", - " 2023-11-5T3:4:55.Iran", - " 2023-11-5T3:4:55.1 ", - " 2023-11-5T3:4:55.1Iran", - " 2023-11-05T03:04:55.123456 ", - " 2023-11-05T03:04:55.123456Iran ", - " 222222 ", - " ", // invalid - "", // invalid - "1-" // 
invalid - ); - ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs( - null, - d_2023_1_1, - d_2023_11_1, - d_2023_11_5, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55 + 100000, - d_2023_11_5_t_3_4_55 + 100000, - d_2023_11_5_t_3_4_55 + 123456, - d_2023_11_5_t_3_4_55 + 123456, - (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, - null, - null, - null); - ColumnVector actual = CastStrings.toTimestamp(input, - "Asia/Shanghai", false, false)) { + ColumnVector input = ColumnVector.fromStrings("2023-11-05 3:04:55"); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expectedResults); + ColumnVector actual = CastStrings.toTimestampWithoutTimeZone(input, false, true)) { AssertUtils.assertColumnsAreEqual(expected, actual); } } @Test - void toTimestampTestAnsi() { - assertThrows(IllegalArgumentException.class, () -> { - try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) { - // ansiEnabled is true - CastStrings.toTimestamp(input, "Asia/Shanghai", false, true); + void toTimestampTestWithTz() { + List> entries = new ArrayList<>(); + // Without timezone + entries.add(new AbstractMap.SimpleEntry<>(" 2000-01-29 ", 949104000000000L)); + // Timezone IDs + entries.add(new AbstractMap.SimpleEntry<>("2023-11-05 3:4:55 America/Sao_Paulo", 1699164295000000L)); + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 Asia/Shanghai", 1699124695100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2000-1-29 13:59:8 Iran", 949141748000000L)); + entries.add(new AbstractMap.SimpleEntry<>("1968-03-25T23:59:1.123Asia/Tokyo", -55846858877000L)); + entries.add(new AbstractMap.SimpleEntry<>("1968-03-25T23:59:1.123456Asia/Tokyo", -55846858876544L)); + + // UTC-like timezones + // no adjustment + entries.add(new AbstractMap.SimpleEntry<>("1970-9-9 2:33:44 Z", 21695624000000L)); + entries.add(new 
AbstractMap.SimpleEntry<>(" 1969-12-1 2:3:4.999Z", -2671015001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1954-10-20 00:11:22 GMT ", -479692118000000L)); + entries.add(new AbstractMap.SimpleEntry<>("1984-1-3 00:11:22UTC", 441936682000000L)); + // hh + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 UTC+18 ", 910231201120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC+0", 910296001120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC-00", 910296001120000L)); + entries.add(new AbstractMap.SimpleEntry<>(" 1998-11-05T20:00:1.12 GMT+09 ", 910263601120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 GMT-1", 910299601120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 UTC-6", 910317601120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 UTC-18", 910360801120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC-00", 910296001120000L)); + entries.add(new AbstractMap.SimpleEntry<>(" 1998-11-05T20:00:1.12 +09 ", 910263601120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 -1", 910299601120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 +18 ", 910231201120000L)); + entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12-00", 910296001120000L)); + // hh:mm + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC+1428", -2723095001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 GMT-1501", -2616955001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 GMT+1:22", -2675935001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.8888 GMT+8:2", -2699935111200L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC+17:9", -2732755001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC-09:11", -2637955001000L)); + entries.add(new 
AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 +1428 ", -2723095001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999-1501 ", -2616955001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 +1:22 ", -2675935001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.8888 +8:2 ", -2699935111200L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999+17:9", -2732755001000L)); + entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 -09:11", -2637955001000L)); + // hh:mm::ss + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT+112233", 1571569871100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC-100102", 1571646886100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC+11:22:33", 1571569871100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT-10:10:10", 1571647434100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT-8:08:01", 1571640105100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC+4:59:59", 1571592825100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3 +102030", 1571492450300000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3 -020103", 1571536943300000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 -8:08:01 ", 1571640105100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1+4:59:59", 1571592825100000L)); + + int validDataSize = entries.size(); + + // Invalid instances + // Timezone without hh:mm:ss + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 Iran", null)); + // Invalid Timezone ID + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 Asia/London", null)); + // Invalid UTC-like timezone + // overflow + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 +10:60", null)); + entries.add(new 
AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC-7:59:60", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 +19", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC-23", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT+1801", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 -180001", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC+18:00:10", null)); + entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT-23:5", null)); + + List inputs = new ArrayList<>(); + List expects = new ArrayList<>(); + for (Map.Entry entry : entries) { + inputs.add(entry.getKey()); + expects.add(entry.getValue()); + } + + // Throw unsupported exception for symbols because Europe/London contains DST rules + assertThrows(ai.rapids.cudf.CudfException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 Europe/London")) { + CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); + } + }); + // Throw unsupported exception for symbols of special dates + for (String date : new String[]{"epoch", "now", "today", "yesterday", "tomorrow"}) + assertThrows(ai.rapids.cudf.CudfException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings(date)) { + CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); } }); + // non-ANSI mode + try ( + ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0])); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expects.toArray(new Long[0])); + ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) { + AssertUtils.assertColumnsAreEqual(expected, actual); + } + + // Should NOT throw exception because all inputs are valid + String[] validInputs = inputs.stream().limit(validDataSize).toArray(String[]::new); + Long[] validExpects = expects.stream().limit(validDataSize).toArray(Long[]::new); + try ( + ColumnVector 
input = ColumnVector.fromStrings(validInputs); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(validExpects); + ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), true)) { + AssertUtils.assertColumnsAreEqual(expected, actual); + } + + // Throw IllegalArgumentException for invalid timestamps under ANSI mode assertThrows(IllegalArgumentException.class, () -> { - try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) { - // ansiEnabled is true - CastStrings.toTimestampWithoutTimeZone(input, true, false, true); + try (ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0]))) { + CastStrings.toTimestamp(input, ZoneId.of("UTC"), true); } }); } From 7c9b8000593e4c57a6057dccb5aa979d9607102e Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 10 Jan 2024 19:40:55 +0800 Subject: [PATCH 10/35] refine --- src/main/cpp/src/CastStringJni.cpp | 63 ++- src/main/cpp/src/datetime_parser.cu | 517 +++++++++++------- src/main/cpp/src/datetime_parser.hpp | 64 ++- .../spark/rapids/jni/GpuTimeZoneDB.java | 18 +- 4 files changed, 403 insertions(+), 259 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 60bf69ff2e..de23f48c39 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -257,21 +257,20 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger CATCH_CAST_EXCEPTION(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv *env, - jclass, - jlong input_column, - jlong transitions_handle, - jlong tz_indices_col, - jlong special_dt_lit_col, - jint tz_default_index, - jboolean ansi_enabled) -{ +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp( + JNIEnv *env, jclass, jlong input_column, jlong transitions_handle, + jlong tz_indices_col, jlong special_dt_lit_col, jint tz_default_index, + jboolean ansi_enabled) { 
JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - auto const &input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); - auto const transitions = reinterpret_cast(transitions_handle)->column(0); + auto const &input_view = cudf::strings_column_view( + *reinterpret_cast(input_column)); + auto const transitions = + reinterpret_cast(transitions_handle) + ->column(0); auto const &tz_indices_view = cudf::strings_column_view( *reinterpret_cast(tz_indices_col)); auto const &special_dt_lit_view = cudf::strings_column_view( @@ -279,40 +278,46 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp auto const tz_index = static_cast(tz_default_index); - auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_with_tz( - input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled); - if (success) { return cudf::jni::release_as_jlong(ret_cv); } + auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( + input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, + ansi_enabled); + if (ret_cv) { + return cudf::jni::release_as_jlong(ret_cv); + } } CATCH_STD(env, 0); // sucess is false, throw exception. - // Note: do not need to release ret_cv, because it's nullptr when success is false. - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); + // Note: do not need to release ret_cv, because it's nullptr when success is + // false. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "Parse failed on Ansi mode", 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone( - JNIEnv* env, - jclass, - jlong input_column, - jlong special_dt_lit_col, - jboolean allow_time_zone, - jboolean ansi_enabled) -{ +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone( + JNIEnv *env, jclass, jlong input_column, jlong special_dt_lit_col, + jboolean allow_time_zone, jboolean ansi_enabled) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - auto const &input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); + auto const &input_view = cudf::strings_column_view( + *reinterpret_cast(input_column)); auto const &special_dt_lit_view = cudf::strings_column_view( *reinterpret_cast(special_dt_lit_col)); - auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone( + auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz( input_view, special_dt_lit_view, allow_time_zone, ansi_enabled); - if (success) { return cudf::jni::release_as_jlong(ret_cv); } + if (ret_cv) { + return cudf::jni::release_as_jlong(ret_cv); + } } CATCH_STD(env, 0); // sucess is false, throw exception. - // Note: do not need to release ret_cv, because it's nullptr when success is false. - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); + // Note: do not need to release ret_cv, because it's nullptr when success is + // false. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "Parse failed on Ansi mode", 0); } } diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 31b5fe0099..a70e49fb33 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -49,14 +49,14 @@ #include #include -using column = cudf::column; -using column_device_view = cudf::column_device_view; -using column_view = cudf::column_view; +using column = cudf::column; +using column_device_view = cudf::column_device_view; +using column_view = cudf::column_view; using lists_column_device_view = cudf::detail::lists_column_device_view; -using size_type = cudf::size_type; -using string_view = cudf::string_view; -using struct_view = cudf::struct_view; -using table_view = cudf::table_view; +using size_type = cudf::size_type; +using string_view = cudf::string_view; +using struct_view = cudf::struct_view; +using table_view = cudf::table_view; namespace { @@ -64,7 +64,7 @@ namespace { * Represents local date time in a time zone. 
*/ struct timestamp_components { - int32_t year; // max 6 digits + int32_t year; // max 6 digits int8_t month; int8_t day; int8_t hour; @@ -76,14 +76,15 @@ struct timestamp_components { /** * Is white space */ -__device__ __host__ inline bool is_whitespace(const char chr) -{ +__device__ __host__ inline bool is_whitespace(const char chr) { switch (chr) { - case ' ': - case '\r': - case '\t': - case '\n': return true; - default: return false; + case ' ': + case '\r': + case '\t': + case '\n': + return true; + default: + return false; } } @@ -97,11 +98,16 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin, char const *actual_end, char const *expect_begin, char const *expect_end) { - if (actual_end - actual_begin != expect_end - expect_begin) { return false; } + if (actual_end - actual_begin != expect_end - expect_begin) { + return false; + } while (expect_begin < expect_end) { // the diff between upper case and lower case for a same char is 32 - if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; } + if (*actual_begin != *expect_begin && + *actual_begin != (*expect_begin - 32)) { + return false; + } actual_begin++; expect_begin++; } @@ -111,112 +117,165 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin, /** * Ported from Spark */ -__device__ __host__ bool is_valid_digits(int segment, int digits) -{ +__device__ __host__ bool is_valid_digits(int segment, int digits) { // A Long is able to represent a timestamp within [+-]200 thousand years const int constexpr maxDigitsYear = 6; - // For the nanosecond part, more than 6 digits is allowed, but will be truncated. 
- return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID - (segment == 7 && digits <= 2) || - (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); + // For the nanosecond part, more than 6 digits is allowed, but will be + // truncated. + return segment == 6 || + (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || + // For the zoneId segment(7), it's could be zero digits when it's a + // region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && + digits <= 2); } -enum ParseResult { - OK = 0, - INVALID = 1, - UNSUPPORTED = 2 -}; +/** + * We have to dintinguish INVALID value with UNSUPPORTED value. + * INVALID means the value is invalid in Spark SQL. + * UNSUPPORTED means the value is valid in Spark SQL but not supported by rapids + * yet. As for INVALID values, we treat them in the same as Spark SQL. As for + * UNSUPPORTED values, we just throw cuDF exception. + */ +enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 }; -template -struct parse_timestamp_string_fn { +template struct parse_timestamp_string_fn { column_device_view const d_strings; column_device_view const special_datetime_names; size_type default_tz_index; bool allow_tz_in_date_str = true; // The list column of transitions to figure out the correct offset // to adjust the timestamp. The type of the values in this column is - // LIST>. - thrust::optional transitions = thrust::nullopt; + // LIST>. 
+ thrust::optional transitions = + thrust::nullopt; thrust::optional tz_indices = thrust::nullopt; - __device__ thrust::tuple operator()(const cudf::size_type& idx) const - { + __device__ thrust::tuple + operator()(const cudf::size_type &idx) const { + // inherit the nullmask of the input column if (!d_strings.is_valid(idx)) { - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::INVALID); } auto const d_str = d_strings.element(idx); timestamp_components ts_comp{}; - char const * tz_lit_ptr = nullptr; + char const *tz_lit_ptr = nullptr; size_type tz_lit_len = 0; - switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) { - case ParseResult::INVALID: - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); - case ParseResult::UNSUPPORTED: - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); - case ParseResult::OK: - default: - break; + switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, + d_str)) { + case ParseResult::INVALID: + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::INVALID); + case ParseResult::UNSUPPORTED: + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::UNSUPPORTED); + case ParseResult::OK: + default: + break; } if constexpr (!with_timezone) { - // path without timezone, in which unix_timestamp is straightforwardly computed + // path without timezone, in which unix_timestamp is straightforwardly + // computed auto const ts_unaligned = compute_epoch_us(ts_comp); - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK); + return thrust::make_tuple( + cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK); } - - // path with timezone, in which timezone offset has to be determined before 
computing unix_timestamp + + // path with timezone, in which timezone offset has to be determined before + // computing unix_timestamp int64_t tz_offset; if (tz_lit_ptr == nullptr) { - tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index); + tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), + default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); - if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) { + // Firstly, try parsing as utc-like timezone rep + if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); + ret_code == 0) { tz_offset = utc_offset; } else if (ret_code == 1) { + // Then, try parsing as region-based timezone ID auto tz_index = query_index_from_tz_db(tz_view); + // tz_index < size(tzDB): found the ID in tzDB + // size(tzDB) <= tz_index < size(tzIDs): found the ID but not supported + // yet tz_index == size(tzIDs): invalid timezone ID if (tz_index > transitions->size()) { if (tz_index == tz_indices->size()) - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::INVALID); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::UNSUPPORTED); } - tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); + tz_offset = + extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); } else { - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + // (ret_code == 2) quick path to mark value invalid + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::INVALID); } } + // Compute the epoch as UTC timezone, then apply the timezone offset. 
auto const ts_unaligned = compute_epoch_us(ts_comp); - return thrust::make_tuple( - cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, - ParseResult::OK); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ + ts_unaligned - tz_offset * 1000000L}}, + ParseResult::OK); } - // TODO: support CST/PST/AST - __device__ inline thrust::pair parse_utc_like_tz(string_view const &tz_lit) const - { + /** + * TODO: support CST/PST/AST + * + * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01. + * This function is purposed to be fully align to Apache Spark's behavior. The + * function returns the status along with the result: 0 - successfully parsed + * the timezone offset 1 - not a valid UTC-like timezone representation, maybe + * valid regioned-base rep 2 - not a valid timezone representation + * + * Valid patterns: + * with colon + * hh:mm : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9]) + * hh:mm:ss : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9] + * without colon + * hh only : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]) + * hh:mm:(ss) : ^(GMT|UTC)?[+-](0[0-9]|1[0-8])([0-5][0-9])?([0-5][0-9])? + * special symbols: + * ^(Z|CST|PST|AST|...) 
+ * + * additional restriction: 18:00:00 is the upper bound (which means 18:00:01 + * is invalid) + */ + __device__ inline thrust::pair + parse_utc_like_tz(string_view const &tz_lit) const { size_type len = tz_lit.size_bytes(); char const *ptr = tz_lit.data(); + // try to parse Z if (*ptr == 'Z') { - if (len > 1) return {0, 1}; + if (len > 1) + return {0, 1}; return {0, 0}; } size_t char_offset = 0; - - if (len > 2 - && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') - || (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) { + // skip UTC|GMT if existing + if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') || + (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) { char_offset = 3; } - if (len == char_offset) return {0, 0}; + // return for the pattern UTC|GMT (without exact offset) + if (len == char_offset) + return {0, 0}; + // parse sign +|- char const sign_char = *(ptr + char_offset++); int64_t sign; if (sign_char == '+') { @@ -224,100 +283,138 @@ struct parse_timestamp_string_fn { } else if (sign_char == '-') { sign = -1L; } else { + // if the rep starts with UTC|GMT, it can NOT be regioned-base rep return {0, char_offset < 3 ? 1 : 2}; } + // parse hh:mm:ss int64_t hms[3] = {0L, 0L, 0L}; bool has_colon = false; - bool one_digit_mm = false; for (size_type i = 0; i < 3; i++) { - if (i == 2 && one_digit_mm) return {0, 2}; - + // deal with the first digit hms[i] = *(ptr + char_offset++) - '0'; - if (hms[i] < 0 || hms[i] > 9) return {0, 2}; + if (hms[i] < 0 || hms[i] > 9) + return {0, 2}; + // deal with trailing single digit instant: + // hh(GMT+8) - valid + // mm(GMT+11:2) - must be separated from (h)h by `:` + // ss(GMT-11:22:3) - invalid if (len == char_offset) { - if (i > 0) { - if (!has_colon) return {0, 2}; - one_digit_mm = true; - } + if (i == 2 || (i == 1 && !has_colon)) + return {0, 2}; break; } + // deal with `:` if (*(ptr + char_offset) == ':') { - if (len == ++char_offset) break; + // 1. 
(i == 1) one_digit mm with ss is invalid (+11:2:3) + // 2. (i == 2) one_dight ss is invalid (+11:22:3) + // 3. trailing `:` is invalid (GMT+8:) + if (i > 0 || len == ++char_offset) + return {0, 2}; has_colon = true; continue; } + // deal with the second digit auto digit = *(ptr + char_offset++) - '0'; - if (digit < 0 || digit > 9) return {0, 2}; + if (digit < 0 || digit > 9) + return {0, 2}; hms[i] = hms[i] * 10 + digit; - if (len == char_offset) break; + if (len == char_offset) + break; + // deal with `:` if (*(ptr + char_offset) == ':') { - if (len == ++char_offset) break; + // trailing `:` is invalid (UTC+11:) + if (len == ++char_offset) + return {0, 2}; has_colon = true; - continue; } - if (has_colon) return {0, 2}; } - if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2}; - if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2}; + // the upper bound is 18:00:00 (regardless of sign) + if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) + return {0, 2}; + if (hms[0] == 18 && hms[1] + hms[2] > 0) + return {0, 2}; return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0}; } - __device__ inline int query_index_from_tz_db(string_view const &tz_lit) const - { - // TODO: replace with more efficient approach (such as binary search or prefix tree) + /** + * TODO: replace linear search with more efficient approach (like prefix tree) + */ + __device__ inline int + query_index_from_tz_db(string_view const &tz_lit) const { auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) { return tz->element(i) == tz_lit; }; - auto ret = thrust::find_if(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tz_indices->size()), - predicate); + auto ret = thrust::find_if( + thrust::seq, thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tz_indices->size()), predicate); return *ret; } - __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, size_type tz_index) const - { + /** + * Perform binaryserach 
to search out the timezone offset based on loose epoch + * instants. Basically, this is the same approach as + * `convert_timestamp_tz_functor`. + */ + __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, + size_type tz_index) const { auto const &utc_offsets = transitions->child().child(2); auto const &loose_instants = transitions->child().child(3); - auto const local_transitions = cudf::list_device_view{*transitions, tz_index}; + auto const local_transitions = + cudf::list_device_view{*transitions, tz_index}; auto const list_size = local_transitions.size(); auto const transition_times = cudf::device_span( loose_instants.data() + local_transitions.element_offset(0), static_cast(list_size)); - auto const it = thrust::upper_bound( - thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); - auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); + auto const it = + thrust::upper_bound(thrust::seq, transition_times.begin(), + transition_times.end(), loose_epoch_second); + auto const idx = + static_cast(thrust::distance(transition_times.begin(), it)); auto const list_offset = local_transitions.element_offset(idx - 1); return static_cast(utc_offsets.element(list_offset)); } - __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const - { - return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + ts.minute * 60L + ts.second; + /** + * The formula to compute loose epoch from local time. The loose epoch is used + * to search for the corresponding timezone offset of specific zone ID from + * TimezoneDB. The target of loose epoch is to transfer local time to a number + * which is proportional to the real timestamp as easily as possible. Loose + * epoch, as a computation approach, helps us to align probe(kernel side) to + * the TimezoneDB(Java side). 
Then, we can apply binary search based on loose + * epoch instants of TimezoneDB to find out the correct timezone offset. + */ + __device__ inline int64_t + compute_loose_epoch_s(timestamp_components const &ts) const { + return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + + ts.hour * 3600L + ts.minute * 60L + ts.second; } - __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const - { - auto const ymd = // chrono class handles the leap year calculations for us + /** + * Leverage STL to convert local time to UTC unix_timestamp(in millisecond) + */ + __device__ inline int64_t + compute_epoch_us(timestamp_components const &ts) const { + auto const ymd = // chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day( cuda::std::chrono::year{ts.year}, cuda::std::chrono::month{static_cast(ts.month)}, cuda::std::chrono::day{static_cast(ts.day)}); auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + + (ts.minute * 60L) + ts.second; return timestamp_s * 1000000L + ts.microseconds; } @@ -330,13 +427,15 @@ struct parse_timestamp_string_fn { * Parse a string with time zone to a timestamp. * The bool in the returned tuple is false if the parse failed. 
*/ - __device__ inline ParseResult parse_string_to_timestamp_us( - timestamp_components *ts_comp, - char const **parsed_tz_ptr, - size_type *parsed_tz_length, - cudf::string_view const ×tamp_str) const { - - if (timestamp_str.empty()) { return ParseResult::INVALID; } + __device__ inline ParseResult + parse_string_to_timestamp_us(timestamp_components *ts_comp, + char const **parsed_tz_ptr, + size_type *parsed_tz_length, + cudf::string_view const ×tamp_str) const { + + if (timestamp_str.empty()) { + return ParseResult::INVALID; + } const char *curr_ptr = timestamp_str.data(); const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); @@ -352,15 +451,18 @@ struct parse_timestamp_string_fn { // TODO: support special dates [epoch, now, today, yesterday, tomorrow] for (size_type i = 0; i < special_datetime_names.size(); i++) { - auto const& ref = special_datetime_names.element(i); - if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) { + auto const &ref = special_datetime_names.element(i); + if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), + ref.data() + ref.size_bytes())) { *parsed_tz_ptr = ref.data(); *parsed_tz_length = ref.size_bytes(); return ParseResult::UNSUPPORTED; } } - if (curr_ptr == end_ptr) { return ParseResult::INVALID; } + if (curr_ptr == end_ptr) { + return ParseResult::INVALID; + } const char *const bytes = curr_ptr; const size_type bytes_length = end_ptr - curr_ptr; @@ -392,14 +494,18 @@ struct parse_timestamp_string_fn { i += 3; } else if (i < 2) { if (b == '-') { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else if (0 == i && ':' == b && !year_sign.has_value()) { // just_time = true; - if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; } + if 
(!is_valid_digits(3, current_segment_digits)) { + return ParseResult::INVALID; + } segments[3] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; @@ -409,7 +515,9 @@ struct parse_timestamp_string_fn { } } else if (2 == i) { if (' ' == b || 'T' == b) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; @@ -419,7 +527,9 @@ struct parse_timestamp_string_fn { } } else if (3 == i || 4 == i) { if (':' == b) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; @@ -429,27 +539,37 @@ struct parse_timestamp_string_fn { } } else if (5 == i || 6 == i) { if ('.' == b && 5 == i) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits) || + !allow_tz_in_date_str) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; *parsed_tz_ptr = bytes + j; // strip the whitespace between timestamp and timezone - while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) ++(*parsed_tz_ptr); + while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) + ++(*parsed_tz_ptr); *parsed_tz_length = end_ptr - *parsed_tz_ptr; break; } - if (i == 6 && '.' 
!= b) { i += 1; } + if (i == 6 && '.' != b) { + i += 1; + } } else { if (i < segments_len && (':' == b || ' ' == b)) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; @@ -459,9 +579,11 @@ struct parse_timestamp_string_fn { } } } else { - if (6 == i) { digits_milli += 1; } - // We will truncate the nanosecond part if there are more than 6 digits, which results - // in loss of precision + if (6 == i) { + digits_milli += 1; + } + // We will truncate the nanosecond part if there are more than 6 digits, + // which results in loss of precision if (6 != i || current_segment_digits < 6) { current_segment_value = current_segment_value * 10 + parsed_value; } @@ -470,7 +592,9 @@ struct parse_timestamp_string_fn { j += 1; } - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { + return ParseResult::INVALID; + } segments[i] = current_segment_value; while (digits_milli < 6) { @@ -495,97 +619,92 @@ struct parse_timestamp_string_fn { }; /** - * - * Trims and parses timestamp string column to a timestamp column and a is valid column + * The common entrance of string_to_timestamp, which combines two paths: + * with_timezone and without_timezone. This function returns the The + * transitions, tz_indices and default_tz_index are only for handling inputs + * with timezone. So, this function distinguish with_timezone callsfrom + * without_timezone ones by checking if transitions and tz_indices are nullptr. 
* */ -std::pair, bool> to_timestamp( - cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, - bool ansi_mode, - bool allow_tz_in_date_str = true, - size_type default_tz_index = 1000000000, - cudf::column_view const *transitions = nullptr, - cudf::strings_column_view const *tz_indices = nullptr) -{ +std::unique_ptr +to_timestamp(cudf::strings_column_view const &input, + cudf::strings_column_view const &special_datetime_lit, + bool ansi_mode, bool allow_tz_in_date_str = true, + size_type default_tz_index = 1000000000, + cudf::column_view const *transitions = nullptr, + cudf::strings_column_view const *tz_indices = nullptr) { auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); auto d_strings = cudf::column_device_view::create(input.parent(), stream); - auto d_special_datetime_lit = cudf::column_device_view::create(special_datetime_lit.parent(), stream); - - auto result_col = - cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, - input.size(), - cudf::mask_state::UNALLOCATED, - stream, - mr); - // record which string is failed to parse. 
- auto result_valid_col = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::UINT8}, - input.size(), - cudf::mask_state::UNALLOCATED, - stream, - mr); + auto d_special_datetime_lit = + cudf::column_device_view::create(special_datetime_lit.parent(), stream); + + // column to store the result timestamp + auto result_col = cudf::make_timestamp_column( + cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), + cudf::mask_state::UNALLOCATED, stream, mr); + // column to store the status `ParseResult` + auto result_valid_col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::UINT8}, input.size(), + cudf::mask_state::UNALLOCATED, stream, mr); if (transitions == nullptr || tz_indices == nullptr) { thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), + rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator( - thrust::make_tuple(result_col->mutable_view().begin(), - result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, - *d_special_datetime_lit, + thrust::make_zip_iterator(thrust::make_tuple( + result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, *d_special_datetime_lit, default_tz_index, allow_tz_in_date_str}); } else { auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; - auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream); + auto d_tz_indices = + cudf::column_device_view::create(tz_indices->parent(), stream); thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), + rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator( - thrust::make_tuple(result_col->mutable_view().begin(), - 
result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, - *d_special_datetime_lit, - default_tz_index, - true, - d_transitions, + thrust::make_zip_iterator(thrust::make_tuple( + result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{*d_strings, *d_special_datetime_lit, + default_tz_index, true, d_transitions, *d_tz_indices}); } auto valid_view = result_valid_col->mutable_view(); - auto exception_exists = thrust::any_of( - rmm::exec_policy(stream), - valid_view.begin(), - valid_view.end(), - []__device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; }); + // throw cuDF exception if there exists any unsupported formats + auto exception_exists = + thrust::any_of(rmm::exec_policy(stream), valid_view.begin(), + valid_view.end(), [] __device__(uint8_t e) { + return e == ParseResult::UNSUPPORTED; + }); if (exception_exists) { CUDF_FAIL("There exists unsupported timestamp schema!"); } + // build the updated nullmask and compute the null count auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if( valid_view.begin(), valid_view.end(), - [] __device__(uint8_t e) { return e == 0; }, - stream, mr); + [] __device__(uint8_t e) { return e == 0; }, stream, mr); + // `output null count > input null count` indicates that there are new null + // values generated during the `to_timestamp` transaction to replace invalid + // inputs. if (ansi_mode && input.null_count() < valid_null_count) { - // has invalid value in validity column under ansi mode - return std::make_pair(nullptr, false); + return nullptr; } result_col->set_null_mask(valid_bitmask, valid_null_count, stream); - return std::make_pair(std::move(result_col), true); + return std::move(result_col); } -} // namespace +} // namespace namespace spark_rapids_jni { @@ -594,36 +713,34 @@ namespace spark_rapids_jni { * Returns a pair of timestamp column and a bool indicates whether successed. 
* If does not have time zone in string, use the default time zone. */ -std::pair, bool> string_to_timestamp_with_tz( - cudf::strings_column_view const& input, - cudf::column_view const& transitions, - cudf::strings_column_view const& tz_indices, - cudf::strings_column_view const& special_datetime_lit, - cudf::size_type default_tz_index, - bool ansi_mode) -{ +std::unique_ptr string_to_timestamp_with_tz( + cudf::strings_column_view const &input, + cudf::column_view const &transitions, + cudf::strings_column_view const &tz_indices, + cudf::strings_column_view const &special_datetime_lit, + cudf::size_type default_tz_index, bool ansi_mode) { if (input.size() == 0) { - return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true); + return nullptr; } - return to_timestamp(input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices); + return to_timestamp(input, special_datetime_lit, ansi_mode, true, + default_tz_index, &transitions, &tz_indices); } /** * Parse string column with time zone to timestamp column, * Returns a pair of timestamp column and a bool indicates whether successed. * Do not use the time zone in string. - * If allow_time_zone is false and string contains time zone, then the string is invalid. + * If allow_time_zone is false and string contains time zone, then the string is + * invalid. 
*/ -std::pair, bool> string_to_timestamp_without_time_zone( - cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, - bool allow_time_zone, - bool ansi_mode) -{ +std::unique_ptr string_to_timestamp_without_tz( + cudf::strings_column_view const &input, + cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone, + bool ansi_mode) { if (input.size() == 0) { - return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true); + return nullptr; } return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 1b72a1fbb8..9c536b6837 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -20,8 +20,9 @@ namespace spark_rapids_jni { /** * - * Trims and parses a timestamp string column with time zone suffix to a timestamp column. - * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00 + * Trims and parses a timestamp string column with time zone suffix to a + * timestamp column. e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 + * 18:00:00 * * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 @@ -47,7 +48,8 @@ namespace spark_rapids_jni { * Spark supports the following zone id forms: * - Z - Zulu time zone UTC+0 * - +|-[h]h:[m]m - * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - A short id, see + * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, * and a suffix in the formats: * - +|-h[h] @@ -63,23 +65,30 @@ namespace spark_rapids_jni { * * * @param input input string column view. 
- * @param default_time_zone if input string does not contain a time zone, use this time zone. - * @param ansi_mode is ansi mode - * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not - * empty otherwise. + * @param transitions TimezoneDB, the table of transitions contains all + * information for timezones + * @param tz_indices TimezoneDB index of region-based timezone IDs + * @param special_datetime_lit cache of special datetimes + * @param default_tz_index the index of default timezone in TimezoneDB, if input + * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use + * this time zone. + * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be + * thrown encountering any invalid inputs. + * @returns the pointer of the timestamp result column, which points to nullptr + * if there exists invalid inputs and ANSI mode is on. */ -std::pair, bool> string_to_timestamp_with_tz( - cudf::strings_column_view const& input, - cudf::column_view const& transitions, - cudf::strings_column_view const& tz_indices, - cudf::strings_column_view const& special_datetime_lit, - cudf::size_type default_tz_index, - bool ansi_mode); +std::unique_ptr string_to_timestamp_with_tz( + cudf::strings_column_view const &input, + cudf::column_view const &transitions, + cudf::strings_column_view const &tz_indices, + cudf::strings_column_view const &special_datetime_lit, + cudf::size_type default_tz_index, bool ansi_mode); /** * - * Trims and parses a timestamp string column with time zone suffix to a timestamp column. - * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00 + * Trims and parses a timestamp string column with time zone suffix to a + * timestamp column. 
e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 + * 18:00:00 * * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394 @@ -105,7 +114,8 @@ std::pair, bool> string_to_timestamp_with_tz( * Spark supports the following zone id forms: * - Z - Zulu time zone UTC+0 * - +|-[h]h:[m]m - * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - A short id, see + * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, * and a suffix in the formats: * - +|-h[h] @@ -121,17 +131,17 @@ std::pair, bool> string_to_timestamp_with_tz( * * * @param input input string column view. + * @param special_datetime_lit cache of special datetimes * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. - * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings. - * @param ansi_mode is ansi mode - * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not - * empty otherwise. + * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be + * thrown encountering any invalid inputs. + * @returns the pointer of the timestamp result column, which points to nullptr + * if there exists invalid inputs and ANSI mode is on. 
*/ -std::pair, bool> string_to_timestamp_without_time_zone( - cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, - bool allow_time_zone, - bool ansi_mode); +std::unique_ptr string_to_timestamp_without_tz( + cudf::strings_column_view const &input, + cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone, + bool ansi_mode); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index f8b49b5b22..cc9ba04a8e 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -40,7 +40,7 @@ public class GpuTimeZoneDB { // For the timezone database, we store the transitions in a ColumnVector that is a list of // structs. The type of this column vector is: - // LIST> + // LIST> private CompletableFuture> zoneIdToTableFuture; private CompletableFuture fixedTransitionsFuture; private CompletableFuture zoneIdVectorFuture; @@ -61,7 +61,7 @@ public class GpuTimeZoneDB { static GpuTimeZoneDB getInstance() { return instance; } - + /** * Start to cache the database. This should be called on startup of an executor. It should start * to cache the data on the CPU in a background thread. It should return immediately and allow the @@ -181,6 +181,16 @@ private void doLoadData() { try { Map zoneIdToTable = new HashMap<>(); List> masterTransitions = new ArrayList<>(); + // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings. + // For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the + // offset of "Asia/Shanghai" in timezoneDB. + // + // Currently, we do NOT support all timezone IDs. For unsupported ones, we ought to throw Exception anyway. 
And + // for invalid ones, we replace them with NULL value when ANSI mode is off. Therefore, we need to distinguish the + // unsupported ones from invalid ones which means the unsupported Ids need to be collected as well. + // To distinguish supported IDs from unsupported ones, we place all unsupported IDs behind supported ones: + // 1. Collect the IDs of all supported timezones in the order of masterTransitions. + // 2. Append the IDs of all unsupported timezones after the suported ones. List zondIdList = new ArrayList<>(); List unsupportedZoneList = new ArrayList<>(); @@ -221,7 +231,7 @@ private void doLoadData() { // the exact EpochSecond. After caching these values along with EpochSeconds, we // can easily search out which time zone transition rule we should apply according // to LocalDateTime structs. The searching procedure is same as the binary search with - // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose EpochSeconds" + // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant" // as search index instead of exact EpochSeconds. Function localToLooseEpochSecond = lt -> 86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L + @@ -257,6 +267,7 @@ private void doLoadData() { } masterTransitions.add(data); zoneIdToTable.put(zoneId.getId(), idx); + // Collect the IDs of all supported timezones in the order of masterTransitions zondIdList.add(zoneId.getId()); } } @@ -270,6 +281,7 @@ private void doLoadData() { HostColumnVector.DataType resultType = new HostColumnVector.ListType(false, childType); + // Append the IDs of all unsupported timezones after the suported ones. 
zondIdList.addAll(unsupportedZoneList); try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) { From 93d4a66cc3e5617ddf839cb175b79156d28d41f4 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 10 Jan 2024 21:39:51 +0800 Subject: [PATCH 11/35] fix clang-fmt --- src/main/cpp/src/CastStringJni.cpp | 65 ++-- src/main/cpp/src/datetime_parser.cu | 470 ++++++++++++--------------- src/main/cpp/src/datetime_parser.hpp | 20 +- 3 files changed, 254 insertions(+), 301 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index de23f48c39..ee0a053b88 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -258,66 +258,67 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp( - JNIEnv *env, jclass, jlong input_column, jlong transitions_handle, - jlong tz_indices_col, jlong special_dt_lit_col, jint tz_default_index, - jboolean ansi_enabled) { +Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, + jclass, + jlong input_column, + jlong transitions_handle, + jlong tz_indices_col, + jlong special_dt_lit_col, + jint tz_default_index, + jboolean ansi_enabled) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - auto const &input_view = cudf::strings_column_view( - *reinterpret_cast(input_column)); + auto const& input_view = + cudf::strings_column_view(*reinterpret_cast(input_column)); auto const transitions = - reinterpret_cast(transitions_handle) - ->column(0); - auto const &tz_indices_view = cudf::strings_column_view( - *reinterpret_cast(tz_indices_col)); - auto const &special_dt_lit_view = cudf::strings_column_view( - *reinterpret_cast(special_dt_lit_col)); + reinterpret_cast(transitions_handle)->column(0); + auto const& tz_indices_view = + 
cudf::strings_column_view(*reinterpret_cast(tz_indices_col)); + auto const& special_dt_lit_view = + cudf::strings_column_view(*reinterpret_cast(special_dt_lit_col)); auto const tz_index = static_cast(tz_default_index); auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( - input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, - ansi_enabled); - if (ret_cv) { - return cudf::jni::release_as_jlong(ret_cv); - } + input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled); + if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); // sucess is false, throw exception. // Note: do not need to release ret_cv, because it's nullptr when success is // false. - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "Parse failed on Ansi mode", 0); + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone( - JNIEnv *env, jclass, jlong input_column, jlong special_dt_lit_col, - jboolean allow_time_zone, jboolean ansi_enabled) { +Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env, + jclass, + jlong input_column, + jlong special_dt_lit_col, + jboolean allow_time_zone, + jboolean ansi_enabled) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - auto const &input_view = cudf::strings_column_view( - *reinterpret_cast(input_column)); - auto const &special_dt_lit_view = cudf::strings_column_view( - *reinterpret_cast(special_dt_lit_col)); + auto const& input_view = + cudf::strings_column_view(*reinterpret_cast(input_column)); + auto const& special_dt_lit_view = + cudf::strings_column_view(*reinterpret_cast(special_dt_lit_col)); auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz( - input_view, special_dt_lit_view, allow_time_zone, ansi_enabled); - if (ret_cv) { - return 
cudf::jni::release_as_jlong(ret_cv); - } + input_view, special_dt_lit_view, allow_time_zone, ansi_enabled); + if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); // sucess is false, throw exception. // Note: do not need to release ret_cv, because it's nullptr when success is // false. - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "Parse failed on Ansi mode", 0); + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); } } diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index a70e49fb33..7e2a73d959 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -49,14 +49,14 @@ #include #include -using column = cudf::column; -using column_device_view = cudf::column_device_view; -using column_view = cudf::column_view; +using column = cudf::column; +using column_device_view = cudf::column_device_view; +using column_view = cudf::column_view; using lists_column_device_view = cudf::detail::lists_column_device_view; -using size_type = cudf::size_type; -using string_view = cudf::string_view; -using struct_view = cudf::struct_view; -using table_view = cudf::table_view; +using size_type = cudf::size_type; +using string_view = cudf::string_view; +using struct_view = cudf::struct_view; +using table_view = cudf::table_view; namespace { @@ -64,7 +64,7 @@ namespace { * Represents local date time in a time zone. 
*/ struct timestamp_components { - int32_t year; // max 6 digits + int32_t year; // max 6 digits int8_t month; int8_t day; int8_t hour; @@ -76,15 +76,14 @@ struct timestamp_components { /** * Is white space */ -__device__ __host__ inline bool is_whitespace(const char chr) { +__device__ __host__ inline bool is_whitespace(const char chr) +{ switch (chr) { - case ' ': - case '\r': - case '\t': - case '\n': - return true; - default: - return false; + case ' ': + case '\r': + case '\t': + case '\n': return true; + default: return false; } } @@ -94,20 +93,16 @@ __device__ __host__ inline bool is_whitespace(const char chr) { * "epoch", "now", "today", "yesterday", "tomorrow" * the expect string should be lower-case a-z chars */ -__device__ inline bool equals_ascii_ignore_case(char const *actual_begin, - char const *actual_end, - char const *expect_begin, - char const *expect_end) { - if (actual_end - actual_begin != expect_end - expect_begin) { - return false; - } +__device__ inline bool equals_ascii_ignore_case(char const* actual_begin, + char const* actual_end, + char const* expect_begin, + char const* expect_end) +{ + if (actual_end - actual_begin != expect_end - expect_begin) { return false; } while (expect_begin < expect_end) { // the diff between upper case and lower case for a same char is 32 - if (*actual_begin != *expect_begin && - *actual_begin != (*expect_begin - 32)) { - return false; - } + if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; } actual_begin++; expect_begin++; } @@ -117,18 +112,17 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin, /** * Ported from Spark */ -__device__ __host__ bool is_valid_digits(int segment, int digits) { +__device__ __host__ bool is_valid_digits(int segment, int digits) +{ // A Long is able to represent a timestamp within [+-]200 thousand years const int constexpr maxDigitsYear = 6; // For the nanosecond part, more than 6 digits is allowed, but will be // 
truncated. - return segment == 6 || - (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || + return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || // For the zoneId segment(7), it's could be zero digits when it's a // region-based zone ID (segment == 7 && digits <= 2) || - (segment != 0 && segment != 6 && segment != 7 && digits > 0 && - digits <= 2); + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); } /** @@ -140,7 +134,8 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) { */ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 }; -template struct parse_timestamp_string_fn { +template +struct parse_timestamp_string_fn { column_device_view const d_strings; column_device_view const special_datetime_names; size_type default_tz_index; @@ -149,55 +144,48 @@ template struct parse_timestamp_string_fn { // to adjust the timestamp. The type of the values in this column is // LIST>. - thrust::optional transitions = - thrust::nullopt; - thrust::optional tz_indices = thrust::nullopt; + thrust::optional transitions = thrust::nullopt; + thrust::optional tz_indices = thrust::nullopt; - __device__ thrust::tuple - operator()(const cudf::size_type &idx) const { + __device__ thrust::tuple operator()(const cudf::size_type& idx) const + { // inherit the nullmask of the input column if (!d_strings.is_valid(idx)) { - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } auto const d_str = d_strings.element(idx); timestamp_components ts_comp{}; - char const *tz_lit_ptr = nullptr; - size_type tz_lit_len = 0; - switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, - d_str)) { - case ParseResult::INVALID: - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); - case ParseResult::UNSUPPORTED: - return 
thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::UNSUPPORTED); - case ParseResult::OK: - default: - break; + char const* tz_lit_ptr = nullptr; + size_type tz_lit_len = 0; + switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) { + case ParseResult::INVALID: + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + case ParseResult::UNSUPPORTED: + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::UNSUPPORTED); + case ParseResult::OK: + default: break; } if constexpr (!with_timezone) { // path without timezone, in which unix_timestamp is straightforwardly // computed auto const ts_unaligned = compute_epoch_us(ts_comp); - return thrust::make_tuple( - cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, + ParseResult::OK); } // path with timezone, in which timezone offset has to be determined before // computing unix_timestamp int64_t tz_offset; if (tz_lit_ptr == nullptr) { - tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), - default_tz_index); + tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); // Firstly, try parsing as utc-like timezone rep - if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); - ret_code == 0) { + if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) { tz_offset = utc_offset; } else if (ret_code == 1) { // Then, try parsing as region-based timezone ID @@ -212,21 +200,18 @@ template struct parse_timestamp_string_fn { return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); } - tz_offset = - extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); + tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); } else { // 
(ret_code == 2) quick path to mark value invalid - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } } // Compute the epoch as UTC timezone, then apply the timezone offset. auto const ts_unaligned = compute_epoch_us(ts_comp); - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ - ts_unaligned - tz_offset * 1000000L}}, - ParseResult::OK); + return thrust::make_tuple( + cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, ParseResult::OK); } /** @@ -251,16 +236,16 @@ template struct parse_timestamp_string_fn { * additional restriction: 18:00:00 is the upper bound (which means 18:00:01 * is invalid) */ - __device__ inline thrust::pair - parse_utc_like_tz(string_view const &tz_lit) const { + __device__ inline thrust::pair parse_utc_like_tz( + string_view const& tz_lit) const + { size_type len = tz_lit.size_bytes(); - char const *ptr = tz_lit.data(); + char const* ptr = tz_lit.data(); // try to parse Z if (*ptr == 'Z') { - if (len > 1) - return {0, 1}; + if (len > 1) return {0, 1}; return {0, 0}; } @@ -272,8 +257,7 @@ template struct parse_timestamp_string_fn { } // return for the pattern UTC|GMT (without exact offset) - if (len == char_offset) - return {0, 0}; + if (len == char_offset) return {0, 0}; // parse sign +|- char const sign_char = *(ptr + char_offset++); @@ -293,16 +277,14 @@ template struct parse_timestamp_string_fn { for (size_type i = 0; i < 3; i++) { // deal with the first digit hms[i] = *(ptr + char_offset++) - '0'; - if (hms[i] < 0 || hms[i] > 9) - return {0, 2}; + if (hms[i] < 0 || hms[i] > 9) return {0, 2}; // deal with trailing single digit instant: // hh(GMT+8) - valid // mm(GMT+11:2) - must be separated from (h)h by `:` // ss(GMT-11:22:3) - invalid if (len == char_offset) { - if (i == 2 || (i == 1 && !has_colon)) - return {0, 2}; + if (i == 2 || (i == 1 && !has_colon)) return 
{0, 2}; break; } @@ -311,34 +293,28 @@ template struct parse_timestamp_string_fn { // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3) // 2. (i == 2) one_dight ss is invalid (+11:22:3) // 3. trailing `:` is invalid (GMT+8:) - if (i > 0 || len == ++char_offset) - return {0, 2}; + if (i > 0 || len == ++char_offset) return {0, 2}; has_colon = true; continue; } // deal with the second digit auto digit = *(ptr + char_offset++) - '0'; - if (digit < 0 || digit > 9) - return {0, 2}; + if (digit < 0 || digit > 9) return {0, 2}; hms[i] = hms[i] * 10 + digit; - if (len == char_offset) - break; + if (len == char_offset) break; // deal with `:` if (*(ptr + char_offset) == ':') { // trailing `:` is invalid (UTC+11:) - if (len == ++char_offset) - return {0, 2}; + if (len == ++char_offset) return {0, 2}; has_colon = true; } } // the upper bound is 18:00:00 (regardless of sign) - if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) - return {0, 2}; - if (hms[0] == 18 && hms[1] + hms[2] > 0) - return {0, 2}; + if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2}; + if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2}; return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0}; } @@ -346,14 +322,15 @@ template struct parse_timestamp_string_fn { /** * TODO: replace linear search with more efficient approach (like prefix tree) */ - __device__ inline int - query_index_from_tz_db(string_view const &tz_lit) const { + __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const + { auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) { return tz->element(i) == tz_lit; }; - auto ret = thrust::find_if( - thrust::seq, thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tz_indices->size()), predicate); + auto ret = thrust::find_if(thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(tz_indices->size()), + predicate); return *ret; } @@ -364,23 +341,21 @@ template struct parse_timestamp_string_fn { * 
`convert_timestamp_tz_functor`. */ __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, - size_type tz_index) const { - auto const &utc_offsets = transitions->child().child(2); - auto const &loose_instants = transitions->child().child(3); + size_type tz_index) const + { + auto const& utc_offsets = transitions->child().child(2); + auto const& loose_instants = transitions->child().child(3); - auto const local_transitions = - cudf::list_device_view{*transitions, tz_index}; - auto const list_size = local_transitions.size(); + auto const local_transitions = cudf::list_device_view{*transitions, tz_index}; + auto const list_size = local_transitions.size(); auto const transition_times = cudf::device_span( - loose_instants.data() + local_transitions.element_offset(0), - static_cast(list_size)); - - auto const it = - thrust::upper_bound(thrust::seq, transition_times.begin(), - transition_times.end(), loose_epoch_second); - auto const idx = - static_cast(thrust::distance(transition_times.begin(), it)); + loose_instants.data() + local_transitions.element_offset(0), + static_cast(list_size)); + + auto const it = thrust::upper_bound( + thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); + auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); auto const list_offset = local_transitions.element_offset(idx - 1); return static_cast(utc_offsets.element(list_offset)); @@ -395,26 +370,24 @@ template struct parse_timestamp_string_fn { * the TimezoneDB(Java side). Then, we can apply binary search based on loose * epoch instants of TimezoneDB to find out the correct timezone offset. 
*/ - __device__ inline int64_t - compute_loose_epoch_s(timestamp_components const &ts) const { - return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + - ts.hour * 3600L + ts.minute * 60L + ts.second; + __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const + { + return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + + ts.minute * 60L + ts.second; } /** * Leverage STL to convert local time to UTC unix_timestamp(in millisecond) */ - __device__ inline int64_t - compute_epoch_us(timestamp_components const &ts) const { - auto const ymd = // chrono class handles the leap year calculations for us - cuda::std::chrono::year_month_day( - cuda::std::chrono::year{ts.year}, - cuda::std::chrono::month{static_cast(ts.month)}, - cuda::std::chrono::day{static_cast(ts.day)}); + __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const + { + auto const ymd = // chrono class handles the leap year calculations for us + cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year}, + cuda::std::chrono::month{static_cast(ts.month)}, + cuda::std::chrono::day{static_cast(ts.day)}); auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + - (ts.minute * 60L) + ts.second; + int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; return timestamp_s * 1000000L + ts.microseconds; } @@ -427,18 +400,16 @@ template struct parse_timestamp_string_fn { * Parse a string with time zone to a timestamp. * The bool in the returned tuple is false if the parse failed. 
*/ - __device__ inline ParseResult - parse_string_to_timestamp_us(timestamp_components *ts_comp, - char const **parsed_tz_ptr, - size_type *parsed_tz_length, - cudf::string_view const ×tamp_str) const { - - if (timestamp_str.empty()) { - return ParseResult::INVALID; - } + __device__ inline ParseResult parse_string_to_timestamp_us( + timestamp_components* ts_comp, + char const** parsed_tz_ptr, + size_type* parsed_tz_length, + cudf::string_view const& timestamp_str) const + { + if (timestamp_str.empty()) { return ParseResult::INVALID; } - const char *curr_ptr = timestamp_str.data(); - const char *end_ptr = curr_ptr + timestamp_str.size_bytes(); + const char* curr_ptr = timestamp_str.data(); + const char* end_ptr = curr_ptr + timestamp_str.size_bytes(); // trim left while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) { @@ -451,29 +422,26 @@ template struct parse_timestamp_string_fn { // TODO: support special dates [epoch, now, today, yesterday, tomorrow] for (size_type i = 0; i < special_datetime_names.size(); i++) { - auto const &ref = special_datetime_names.element(i); - if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), - ref.data() + ref.size_bytes())) { - *parsed_tz_ptr = ref.data(); + auto const& ref = special_datetime_names.element(i); + if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) { + *parsed_tz_ptr = ref.data(); *parsed_tz_length = ref.size_bytes(); return ParseResult::UNSUPPORTED; } } - if (curr_ptr == end_ptr) { - return ParseResult::INVALID; - } + if (curr_ptr == end_ptr) { return ParseResult::INVALID; } - const char *const bytes = curr_ptr; + const char* const bytes = curr_ptr; const size_type bytes_length = end_ptr - curr_ptr; - int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; - int segments_len = 9; - int i = 0; - int current_segment_value = 0; + int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; + int segments_len = 9; + int i = 0; + int current_segment_value = 0; int current_segment_digits = 0; - 
size_t j = 0; - int digits_milli = 0; + size_t j = 0; + int digits_milli = 0; // bool just_time = false; thrust::optional year_sign; if ('-' == bytes[j] || '+' == bytes[j]) { @@ -486,7 +454,7 @@ template struct parse_timestamp_string_fn { } while (j < bytes_length) { - char b = bytes[j]; + char b = bytes[j]; int parsed_value = static_cast(b - '0'); if (parsed_value < 0 || parsed_value > 9) { if (0 == j && 'T' == b) { @@ -494,32 +462,26 @@ template struct parse_timestamp_string_fn { i += 3; } else if (i < 2) { if (b == '-') { - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[i] = current_segment_value; - current_segment_value = 0; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; } else if (0 == i && ':' == b && !year_sign.has_value()) { // just_time = true; - if (!is_valid_digits(3, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[3] = current_segment_value; - current_segment_value = 0; + if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; } + segments[3] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; - i = 4; + i = 4; } else { return ParseResult::INVALID; } } else if (2 == i) { if (' ' == b || 'T' == b) { - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[i] = current_segment_value; - current_segment_value = 0; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; } else { @@ -527,11 +489,9 @@ template struct parse_timestamp_string_fn { } } else if (3 == i || 4 == i) { if (':' == b) { - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[i] = current_segment_value; - 
current_segment_value = 0; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; } else { @@ -539,20 +499,17 @@ template struct parse_timestamp_string_fn { } } else if (5 == i || 6 == i) { if ('.' == b && 5 == i) { - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[i] = current_segment_value; - current_segment_value = 0; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - if (!is_valid_digits(i, current_segment_digits) || - !allow_tz_in_date_str) { + if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { return ParseResult::INVALID; } - segments[i] = current_segment_value; - current_segment_value = 0; + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; *parsed_tz_ptr = bytes + j; @@ -562,16 +519,12 @@ template struct parse_timestamp_string_fn { *parsed_tz_length = end_ptr - *parsed_tz_ptr; break; } - if (i == 6 && '.' != b) { - i += 1; - } + if (i == 6 && '.' 
!= b) { i += 1; } } else { if (i < segments_len && (':' == b || ' ' == b)) { - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } - segments[i] = current_segment_value; - current_segment_value = 0; + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + segments[i] = current_segment_value; + current_segment_value = 0; current_segment_digits = 0; i += 1; } else { @@ -579,9 +532,7 @@ template struct parse_timestamp_string_fn { } } } else { - if (6 == i) { - digits_milli += 1; - } + if (6 == i) { digits_milli += 1; } // We will truncate the nanosecond part if there are more than 6 digits, // which results in loss of precision if (6 != i || current_segment_digits < 6) { @@ -592,9 +543,7 @@ template struct parse_timestamp_string_fn { j += 1; } - if (!is_valid_digits(i, current_segment_digits)) { - return ParseResult::INVALID; - } + if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } segments[i] = current_segment_value; while (digits_milli < 6) { @@ -606,12 +555,12 @@ template struct parse_timestamp_string_fn { // above is ported from Spark. // set components - ts_comp->year = segments[0]; - ts_comp->month = static_cast(segments[1]); - ts_comp->day = static_cast(segments[2]); - ts_comp->hour = static_cast(segments[3]); - ts_comp->minute = static_cast(segments[4]); - ts_comp->second = static_cast(segments[5]); + ts_comp->year = segments[0]; + ts_comp->month = static_cast(segments[1]); + ts_comp->day = static_cast(segments[2]); + ts_comp->hour = static_cast(segments[3]); + ts_comp->minute = static_cast(segments[4]); + ts_comp->second = static_cast(segments[5]); ts_comp->microseconds = segments[6]; return ParseResult::OK; @@ -626,85 +575,86 @@ template struct parse_timestamp_string_fn { * without_timezone ones by checking if transitions and tz_indices are nullptr. 
* */ -std::unique_ptr -to_timestamp(cudf::strings_column_view const &input, - cudf::strings_column_view const &special_datetime_lit, - bool ansi_mode, bool allow_tz_in_date_str = true, - size_type default_tz_index = 1000000000, - cudf::column_view const *transitions = nullptr, - cudf::strings_column_view const *tz_indices = nullptr) { +std::unique_ptr to_timestamp(cudf::strings_column_view const& input, + cudf::strings_column_view const& special_datetime_lit, + bool ansi_mode, + bool allow_tz_in_date_str = true, + size_type default_tz_index = 1000000000, + cudf::column_view const* transitions = nullptr, + cudf::strings_column_view const* tz_indices = nullptr) +{ auto const stream = cudf::get_default_stream(); - auto const mr = rmm::mr::get_current_device_resource(); + auto const mr = rmm::mr::get_current_device_resource(); auto d_strings = cudf::column_device_view::create(input.parent(), stream); auto d_special_datetime_lit = - cudf::column_device_view::create(special_datetime_lit.parent(), stream); + cudf::column_device_view::create(special_datetime_lit.parent(), stream); // column to store the result timestamp - auto result_col = cudf::make_timestamp_column( - cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(), - cudf::mask_state::UNALLOCATED, stream, mr); + auto result_col = + cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, + input.size(), + cudf::mask_state::UNALLOCATED, + stream, + mr); // column to store the status `ParseResult` auto result_valid_col = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::UINT8}, input.size(), - cudf::mask_state::UNALLOCATED, stream, mr); + cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); if (transitions == nullptr || tz_indices == nullptr) { thrust::transform( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - 
thrust::make_zip_iterator(thrust::make_tuple( - result_col->mutable_view().begin(), - result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, *d_special_datetime_lit, - default_tz_index, - allow_tz_in_date_str}); + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{ + *d_strings, *d_special_datetime_lit, default_tz_index, allow_tz_in_date_str}); } else { - auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); + auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; - auto d_tz_indices = - cudf::column_device_view::create(tz_indices->parent(), stream); + auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream); thrust::transform( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator(thrust::make_tuple( - result_col->mutable_view().begin(), - result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{*d_strings, *d_special_datetime_lit, - default_tz_index, true, d_transitions, - *d_tz_indices}); + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{ + *d_strings, *d_special_datetime_lit, default_tz_index, true, d_transitions, *d_tz_indices}); } auto valid_view = result_valid_col->mutable_view(); // throw cuDF exception if there exists any unsupported formats auto exception_exists = - thrust::any_of(rmm::exec_policy(stream), valid_view.begin(), - valid_view.end(), [] __device__(uint8_t 
e) { - return e == ParseResult::UNSUPPORTED; - }); - if (exception_exists) { - CUDF_FAIL("There exists unsupported timestamp schema!"); - } + thrust::any_of(rmm::exec_policy(stream), + valid_view.begin(), + valid_view.end(), + [] __device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; }); + if (exception_exists) { CUDF_FAIL("There exists unsupported timestamp schema!"); } // build the updated nullmask and compute the null count auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if( - valid_view.begin(), valid_view.end(), - [] __device__(uint8_t e) { return e == 0; }, stream, mr); + valid_view.begin(), + valid_view.end(), + [] __device__(uint8_t e) { return e == 0; }, + stream, + mr); // `output null count > input null count` indicates that there are new null // values generated during the `to_timestamp` transaction to replace invalid // inputs. - if (ansi_mode && input.null_count() < valid_null_count) { - return nullptr; - } + if (ansi_mode && input.null_count() < valid_null_count) { return nullptr; } result_col->set_null_mask(valid_bitmask, valid_null_count, stream); return std::move(result_col); } -} // namespace +} // namespace namespace spark_rapids_jni { @@ -714,16 +664,16 @@ namespace spark_rapids_jni { * If does not have time zone in string, use the default time zone. 
*/ std::unique_ptr string_to_timestamp_with_tz( - cudf::strings_column_view const &input, - cudf::column_view const &transitions, - cudf::strings_column_view const &tz_indices, - cudf::strings_column_view const &special_datetime_lit, - cudf::size_type default_tz_index, bool ansi_mode) { - if (input.size() == 0) { - return nullptr; - } - return to_timestamp(input, special_datetime_lit, ansi_mode, true, - default_tz_index, &transitions, &tz_indices); + cudf::strings_column_view const& input, + cudf::column_view const& transitions, + cudf::strings_column_view const& tz_indices, + cudf::strings_column_view const& special_datetime_lit, + cudf::size_type default_tz_index, + bool ansi_mode) +{ + if (input.size() == 0) { return nullptr; } + return to_timestamp( + input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices); } /** @@ -734,13 +684,13 @@ std::unique_ptr string_to_timestamp_with_tz( * invalid. */ std::unique_ptr string_to_timestamp_without_tz( - cudf::strings_column_view const &input, - cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone, - bool ansi_mode) { - if (input.size() == 0) { - return nullptr; - } + cudf::strings_column_view const& input, + cudf::strings_column_view const& special_datetime_lit, + bool allow_time_zone, + bool ansi_mode) +{ + if (input.size() == 0) { return nullptr; } return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 9c536b6837..bcdfe55ebf 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -78,11 +78,12 @@ namespace spark_rapids_jni { * if there exists invalid inputs and ANSI mode is on. 
*/ std::unique_ptr string_to_timestamp_with_tz( - cudf::strings_column_view const &input, - cudf::column_view const &transitions, - cudf::strings_column_view const &tz_indices, - cudf::strings_column_view const &special_datetime_lit, - cudf::size_type default_tz_index, bool ansi_mode); + cudf::strings_column_view const& input, + cudf::column_view const& transitions, + cudf::strings_column_view const& tz_indices, + cudf::strings_column_view const& special_datetime_lit, + cudf::size_type default_tz_index, + bool ansi_mode); /** * @@ -140,8 +141,9 @@ std::unique_ptr string_to_timestamp_with_tz( * if there exists invalid inputs and ANSI mode is on. */ std::unique_ptr string_to_timestamp_without_tz( - cudf::strings_column_view const &input, - cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone, - bool ansi_mode); + cudf::strings_column_view const& input, + cudf::strings_column_view const& special_datetime_lit, + bool allow_time_zone, + bool ansi_mode); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni From 2cf49405eb2dc20d0803736eab4518abafa27a25 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 11 Jan 2024 13:40:07 +0800 Subject: [PATCH 12/35] Copyrights;typos --- src/main/cpp/CMakeLists.txt | 2 +- src/main/cpp/src/CastStringJni.cpp | 2 +- src/main/cpp/src/datetime_parser.cu | 22 +- src/main/cpp/src/datetime_parser.hpp | 6 +- src/main/cpp/tests/CMakeLists.txt | 3 - src/main/cpp/tests/datetime_parser.cpp | 188 ------------------ .../nvidia/spark/rapids/jni/CastStrings.java | 2 +- .../spark/rapids/jni/GpuTimeZoneDB.java | 2 +- .../spark/rapids/jni/CastStringsTest.java | 2 +- 9 files changed, 19 insertions(+), 210 deletions(-) delete mode 100644 src/main/cpp/tests/datetime_parser.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 4eabade61b..d83253747b 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # 
============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index ee0a053b88..6d3fb6b405 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 7e2a73d959..25eef7291c 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -126,7 +126,7 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) } /** - * We have to dintinguish INVALID value with UNSUPPORTED value. + * We have to distinguish INVALID value with UNSUPPORTED value. * INVALID means the value is invalid in Spark SQL. * UNSUPPORTED means the value is valid in Spark SQL but not supported by rapids * yet. As for INVALID values, we treat them in the same as Spark SQL. 
As for @@ -149,7 +149,7 @@ struct parse_timestamp_string_fn { __device__ thrust::tuple operator()(const cudf::size_type& idx) const { - // inherit the nullmask of the input column + // inherit the null mask of the input column if (!d_strings.is_valid(idx)) { return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } @@ -221,7 +221,7 @@ struct parse_timestamp_string_fn { * This function is purposed to be fully align to Apache Spark's behavior. The * function returns the status along with the result: 0 - successfully parsed * the timezone offset 1 - not a valid UTC-like timezone representation, maybe - * valid regioned-base rep 2 - not a valid timezone representation + * valid region-based rep 2 - not a valid timezone representation * * Valid patterns: * with colon @@ -267,7 +267,7 @@ struct parse_timestamp_string_fn { } else if (sign_char == '-') { sign = -1L; } else { - // if the rep starts with UTC|GMT, it can NOT be regioned-base rep + // if the rep starts with UTC|GMT, it can NOT be region-based rep return {0, char_offset < 3 ? 1 : 2}; } @@ -291,7 +291,7 @@ struct parse_timestamp_string_fn { // deal with `:` if (*(ptr + char_offset) == ':') { // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3) - // 2. (i == 2) one_dight ss is invalid (+11:22:3) + // 2. (i == 2) one_digit ss is invalid (+11:22:3) // 3. trailing `:` is invalid (GMT+8:) if (i > 0 || len == ++char_offset) return {0, 2}; has_colon = true; @@ -336,7 +336,7 @@ struct parse_timestamp_string_fn { } /** - * Perform binaryserach to search out the timezone offset based on loose epoch + * Perform binary search to search out the timezone offset based on loose epoch * instants. Basically, this is the same approach as * `convert_timestamp_tz_functor`. */ @@ -571,7 +571,7 @@ struct parse_timestamp_string_fn { * The common entrance of string_to_timestamp, which combines two paths: * with_timezone and without_timezone. 
This function returns the The * transitions, tz_indices and default_tz_index are only for handling inputs - * with timezone. So, this function distinguish with_timezone callsfrom + * with timezone. So, this function distinguish with_timezone calls from * without_timezone ones by checking if transitions and tz_indices are nullptr. * */ @@ -637,7 +637,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu [] __device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; }); if (exception_exists) { CUDF_FAIL("There exists unsupported timestamp schema!"); } - // build the updated nullmask and compute the null count + // build the updated null mask and compute the null count auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if( valid_view.begin(), valid_view.end(), @@ -660,7 +660,7 @@ namespace spark_rapids_jni { /** * Parse string column with time zone to timestamp column, - * Returns a pair of timestamp column and a bool indicates whether successed. + * Returns a pair of timestamp column and a bool indicates whether it successes. * If does not have time zone in string, use the default time zone. */ std::unique_ptr string_to_timestamp_with_tz( @@ -678,7 +678,7 @@ std::unique_ptr string_to_timestamp_with_tz( /** * Parse string column with time zone to timestamp column, - * Returns a pair of timestamp column and a bool indicates whether successed. + * Returns a pair of timestamp column and a bool indicates whether it successes. * Do not use the time zone in string. * If allow_time_zone is false and string contains time zone, then the string is * invalid. diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index bcdfe55ebf..b1753af6a2 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ namespace spark_rapids_jni { * @param transitions TimezoneDB, the table of transitions contains all * information for timezones * @param tz_indices TimezoneDB index of region-based timezone IDs - * @param special_datetime_lit cache of special datetimes + * @param special_datetime_lit cache of special date times * @param default_tz_index the index of default timezone in TimezoneDB, if input * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use * this time zone. @@ -132,7 +132,7 @@ std::unique_ptr string_to_timestamp_with_tz( * * * @param input input string column view. - * @param special_datetime_lit cache of special datetimes + * @param special_datetime_lit cache of special date times * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 1f58176327..617df6dfde 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -57,9 +57,6 @@ ConfigureTest(FORMAT_FLOAT ConfigureTest(CAST_FLOAT_TO_STRING cast_float_to_string.cpp) -ConfigureTest(DATETIME_PARSER - datetime_parser.cpp) - ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp deleted file mode 100644 index 9ab4271327..0000000000 --- a/src/main/cpp/tests/datetime_parser.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include - -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using timestamp_col = - cudf::test::fixed_width_column_wrapper; -using micros_col = - cudf::test::fixed_width_column_wrapper; -struct DateTimeParserTest : public cudf::test::BaseFixture {}; - -TEST_F(DateTimeParserTest, ParseTimestamp) -{ - auto ts_strings = cudf::test::strings_column_wrapper( - { - "2023", - " 2023 ", - " 2023-11 ", - " 2023-11-5 ", - " 2023-11-05 3:04:55 ", - " 2023-11-05T03:4:55 ", - " 2023-11-05T3:4:55 ", - " 2023-11-5T3:4:55.", - " 2023-11-5T3:4:55.Iran", - " 2023-11-5T3:4:55.1 ", - " 2023-11-5T3:4:55.1Iran", - " 2023-11-05T03:04:55.123456 ", - " 2023-11-05T03:04:55.123456Iran ", - " 222222 ", - " ", // invalid - "", // invalid - "1-" // invalid - - }, - { - - 0, // null bit - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - - }); - auto d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L; - auto d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L; - auto d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L; - auto t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L; - auto d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55; - auto ts_col = timestamp_col( - { - - 0L, - d_2023_1_1, - d_2023_11_1, - d_2023_11_5, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55, - d_2023_11_5_t_3_4_55 + 100000, - 
d_2023_11_5_t_3_4_55 + 100000, - d_2023_11_5_t_3_4_55 + 123456, - d_2023_11_5_t_3_4_55 + 123456, - (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L, - 0L, - 0L, - 0L - - }, - { - 0, // null bit - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, // null bit - 0, // null bit - 0 // null bit - - }); - auto ret = - spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, false); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first)); - assert(ret.second == true); - - ts_strings = cudf::test::strings_column_wrapper( - { - - "invalid" - - }, - { - - 1 - - }); - ts_col = timestamp_col( - { - - 0L - - }, - {0 - - }); - ret = - spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true); - assert(ret.first == nullptr); - assert(ret.second == false); - - ts_strings = cudf::test::strings_column_wrapper( - { - - " Epoch ", " NOW ", " today ", " tomoRRow ", " yesTERday " - - }, - { - - 1, 1, 1, 1, 1 - - }); - ts_col = timestamp_col( - {// Temp implement: epoch -> 111, now -> 222, ... , yesterday -> 555 - 111L, - 222L, - 333L, - 444L, - 555L - - }, - {1, 1, 1, 1, 1 - - }); - ret = - spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first)); - assert(ret.second == true); -} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index b383468e7e..eb5c09b062 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index cc9ba04a8e..cccf831081 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2023, NVIDIA CORPORATION. +* Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index a8939bc825..e5384e4be7 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 8eff534ec7bc76497a46a11e0e8759ca3f45942a Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 11 Jan 2024 17:25:03 +0800 Subject: [PATCH 13/35] Fix compile error; Update comments --- src/main/cpp/src/datetime_parser.cu | 2 +- src/main/cpp/src/datetime_parser.hpp | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 25eef7291c..8e9c503722 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -651,7 +651,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu if (ansi_mode && input.null_count() < valid_null_count) { return nullptr; } result_col->set_null_mask(valid_bitmask, valid_null_count, stream); - return std::move(result_col); + return result_col; } } // namespace diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index b1753af6a2..7ee05b84ec 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -58,11 +58,7 @@ namespace spark_rapids_jni { * - +|-hhmmss * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` * - * Unlike Spark, Spark-Rapids only supports the following time zones: - * - Z - Zulu time zone UTC+0 - * - +|-[h]h:[m]m - * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` - * + * Unlike Spark, Spark-Rapids currently does not support DST time zones. * * @param input input string column view. 
* @param transitions TimezoneDB, the table of transitions contains all @@ -125,10 +121,7 @@ std::unique_ptr string_to_timestamp_with_tz( * - +|-hhmmss * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` * - * Unlike Spark, Spark-Rapids only supports the following time zones: - * - Z - Zulu time zone UTC+0 - * - +|-[h]h:[m]m - * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * Unlike Spark, Spark-Rapids currently does not support DST time zones. * * * @param input input string column view. From 5dbc7ebc785026032167b030b249b68526f36f2e Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 12 Jan 2024 18:30:36 +0800 Subject: [PATCH 14/35] Remove supports for cast special strings(epoch now today yesterday tomorrow) to timestamp because only Spark31x supports and Spark320+ does not supports --- src/main/cpp/src/CastStringJni.cpp | 11 +---- src/main/cpp/src/datetime_parser.cu | 47 ++----------------- src/main/cpp/src/datetime_parser.hpp | 4 -- .../nvidia/spark/rapids/jni/CastStrings.java | 17 +++---- .../spark/rapids/jni/GpuTimeZoneDB.java | 23 ++------- .../spark/rapids/jni/CastStringsTest.java | 8 +++- 6 files changed, 21 insertions(+), 89 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 6d3fb6b405..82a3dc3242 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -263,7 +263,6 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, jlong input_column, jlong transitions_handle, jlong tz_indices_col, - jlong special_dt_lit_col, jint tz_default_index, jboolean ansi_enabled) { @@ -277,13 +276,10 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, reinterpret_cast(transitions_handle)->column(0); auto const& tz_indices_view = cudf::strings_column_view(*reinterpret_cast(tz_indices_col)); - auto const& special_dt_lit_view = - cudf::strings_column_view(*reinterpret_cast(special_dt_lit_col)); - auto 
const tz_index = static_cast(tz_default_index); auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( - input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled); + input_view, transitions, tz_indices_view, tz_index, ansi_enabled); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); @@ -298,7 +294,6 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env, jclass, jlong input_column, - jlong special_dt_lit_col, jboolean allow_time_zone, jboolean ansi_enabled) { @@ -307,11 +302,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* cudf::jni::auto_set_device(env); auto const& input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); - auto const& special_dt_lit_view = - cudf::strings_column_view(*reinterpret_cast(special_dt_lit_col)); auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz( - input_view, special_dt_lit_view, allow_time_zone, ansi_enabled); + input_view, allow_time_zone, ansi_enabled); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 8e9c503722..3ba84f2d26 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -87,28 +87,6 @@ __device__ __host__ inline bool is_whitespace(const char chr) } } -/** - * Whether the given two strings are equal, - * used to compare special timestamp strings ignoring case: - * "epoch", "now", "today", "yesterday", "tomorrow" - * the expect string should be lower-case a-z chars - */ -__device__ inline bool equals_ascii_ignore_case(char const* actual_begin, - char const* actual_end, - char const* expect_begin, - char const* expect_end) -{ - if (actual_end - actual_begin != expect_end - expect_begin) { return false; } - - while (expect_begin < expect_end) { - // the diff between upper case and 
lower case for a same char is 32 - if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; } - actual_begin++; - expect_begin++; - } - return true; -} - /** * Ported from Spark */ @@ -137,7 +115,6 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 }; template struct parse_timestamp_string_fn { column_device_view const d_strings; - column_device_view const special_datetime_names; size_type default_tz_index; bool allow_tz_in_date_str = true; // The list column of transitions to figure out the correct offset @@ -420,16 +397,6 @@ struct parse_timestamp_string_fn { --end_ptr; } - // TODO: support special dates [epoch, now, today, yesterday, tomorrow] - for (size_type i = 0; i < special_datetime_names.size(); i++) { - auto const& ref = special_datetime_names.element(i); - if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) { - *parsed_tz_ptr = ref.data(); - *parsed_tz_length = ref.size_bytes(); - return ParseResult::UNSUPPORTED; - } - } - if (curr_ptr == end_ptr) { return ParseResult::INVALID; } const char* const bytes = curr_ptr; @@ -576,7 +543,6 @@ struct parse_timestamp_string_fn { * */ std::unique_ptr to_timestamp(cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, bool ansi_mode, bool allow_tz_in_date_str = true, size_type default_tz_index = 1000000000, @@ -587,9 +553,6 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu auto const mr = rmm::mr::get_current_device_resource(); auto d_strings = cudf::column_device_view::create(input.parent(), stream); - auto d_special_datetime_lit = - cudf::column_device_view::create(special_datetime_lit.parent(), stream); - // column to store the result timestamp auto result_col = cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, @@ -610,7 +573,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu 
thrust::make_tuple(result_col->mutable_view().begin(), result_valid_col->mutable_view().begin())), parse_timestamp_string_fn{ - *d_strings, *d_special_datetime_lit, default_tz_index, allow_tz_in_date_str}); + *d_strings, default_tz_index, allow_tz_in_date_str}); } else { auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; @@ -624,7 +587,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu thrust::make_tuple(result_col->mutable_view().begin(), result_valid_col->mutable_view().begin())), parse_timestamp_string_fn{ - *d_strings, *d_special_datetime_lit, default_tz_index, true, d_transitions, *d_tz_indices}); + *d_strings, default_tz_index, true, d_transitions, *d_tz_indices}); } auto valid_view = result_valid_col->mutable_view(); @@ -667,13 +630,12 @@ std::unique_ptr string_to_timestamp_with_tz( cudf::strings_column_view const& input, cudf::column_view const& transitions, cudf::strings_column_view const& tz_indices, - cudf::strings_column_view const& special_datetime_lit, cudf::size_type default_tz_index, bool ansi_mode) { if (input.size() == 0) { return nullptr; } return to_timestamp( - input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices); + input, ansi_mode, true, default_tz_index, &transitions, &tz_indices); } /** @@ -685,12 +647,11 @@ std::unique_ptr string_to_timestamp_with_tz( */ std::unique_ptr string_to_timestamp_without_tz( cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, bool allow_time_zone, bool ansi_mode) { if (input.size() == 0) { return nullptr; } - return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone); + return to_timestamp(input, ansi_mode, allow_time_zone); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 7ee05b84ec..c7f9d5ec65 100644 --- 
a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -64,7 +64,6 @@ namespace spark_rapids_jni { * @param transitions TimezoneDB, the table of transitions contains all * information for timezones * @param tz_indices TimezoneDB index of region-based timezone IDs - * @param special_datetime_lit cache of special date times * @param default_tz_index the index of default timezone in TimezoneDB, if input * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use * this time zone. @@ -77,7 +76,6 @@ std::unique_ptr string_to_timestamp_with_tz( cudf::strings_column_view const& input, cudf::column_view const& transitions, cudf::strings_column_view const& tz_indices, - cudf::strings_column_view const& special_datetime_lit, cudf::size_type default_tz_index, bool ansi_mode); @@ -125,7 +123,6 @@ std::unique_ptr string_to_timestamp_with_tz( * * * @param input input string column view. - * @param special_datetime_lit cache of special date times * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: * 1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone. * @param ansi_mode whether enforce ANSI mode or not. 
If true, exception will be @@ -135,7 +132,6 @@ std::unique_ptr string_to_timestamp_with_tz( */ std::unique_ptr string_to_timestamp_without_tz( cudf::strings_column_view const& input, - cudf::strings_column_view const& special_datetime_lit, bool allow_time_zone, bool ansi_mode); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index eb5c09b062..c515269c27 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -200,10 +200,9 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString()); try (Table transitions = singleton.getTransitions(); - ColumnVector tzIndices = singleton.getZoneIDVector(); - ColumnVector specialTz = singleton.getSpecialTzVector()) { + ColumnVector tzIndices = singleton.getZoneIDVector()) { return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(), - tzIndices.getNativeView(), specialTz.getNativeView(), tzIndex, ansiEnabled)); + tzIndices.getNativeView(), tzIndex, ansiEnabled)); } } @@ -246,11 +245,7 @@ public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean all if (!singleton.isLoaded()) { GpuTimeZoneDB.cacheDatabase(); } - - try (ColumnVector specialTz = singleton.getSpecialTzVector()) { - return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), specialTz.getNativeView(), - allowTimeZone, ansiEnabled)); - } + return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone, ansiEnabled)); } private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip, @@ -265,7 +260,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int 
base); private static native long toTimestamp(long input, - long transitions, long tzIndices, long specialDate, int tzIndex, boolean ansiEnabled); - private static native long toTimestampWithoutTimeZone(long input, - long specialDate, boolean allowTimeZone, boolean ansiEnabled); + long transitions, long tzIndices, int tzIndex, boolean ansiEnabled); + private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone, + boolean ansiEnabled); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index cccf831081..6b09d2dda0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -35,8 +35,6 @@ public class GpuTimeZoneDB { public static final int TIMEOUT_SECS = 300; - public static final String[] SPECIAL_TZ_LITERALS = {"epoch", "now", "today", "tomorrow", "yesterday"}; - // For the timezone database, we store the transitions in a ColumnVector that is a list of // structs. 
The type of this column vector is: @@ -44,7 +42,6 @@ public class GpuTimeZoneDB { private CompletableFuture> zoneIdToTableFuture; private CompletableFuture fixedTransitionsFuture; private CompletableFuture zoneIdVectorFuture; - private CompletableFuture specialTzLiteralsFuture; private boolean closed = false; @@ -52,7 +49,6 @@ public class GpuTimeZoneDB { zoneIdToTableFuture = new CompletableFuture<>(); fixedTransitionsFuture = new CompletableFuture<>(); zoneIdVectorFuture = new CompletableFuture<>(); - specialTzLiteralsFuture = new CompletableFuture<>(); } private static GpuTimeZoneDB instance = new GpuTimeZoneDB(); @@ -163,7 +159,7 @@ public static ZoneId getZoneId(String timeZoneId) { public boolean isLoaded() { return zoneIdToTableFuture.isDone() && fixedTransitionsFuture.isDone() && - zoneIdVectorFuture.isDone() && specialTzLiteralsFuture.isDone(); + zoneIdVectorFuture.isDone(); } private void loadData(Executor executor) throws IllegalStateException { @@ -286,18 +282,14 @@ private void doLoadData() { try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) { try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) { - try (HostColumnVector specialTzVector = HostColumnVector.fromStrings(SPECIAL_TZ_LITERALS)) { - fixedTransitionsFuture.complete(fixedTransitions.incRefCount()); - zoneIdVectorFuture.complete(zoneIdVector.incRefCount()); - specialTzLiteralsFuture.complete(specialTzVector.incRefCount()); - } + fixedTransitionsFuture.complete(fixedTransitions.incRefCount()); + zoneIdVectorFuture.complete(zoneIdVector.incRefCount()); } } } catch (Exception e) { fixedTransitionsFuture.completeExceptionally(e); zoneIdToTableFuture.completeExceptionally(e); zoneIdVectorFuture.completeExceptionally(e); - specialTzLiteralsFuture.completeExceptionally(e); throw e; } } @@ -340,15 +332,6 @@ public ColumnVector getZoneIDVector() { } } - public ColumnVector 
getSpecialTzVector() { - try { - HostColumnVector hcv = specialTzLiteralsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); - return hcv.copyToDevice(); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - throw new RuntimeException(e); - } - } - public Table getTransitions() { try (ColumnVector fixedTransitions = getFixedTransitions()) { return new Table(fixedTransitions); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index e5384e4be7..03ab672c4c 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -436,11 +436,15 @@ void toTimestampTestWithTz() { CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); } }); + // Throw unsupported exception for symbols of special dates + // Note: Spark 31x supports "epoch", "now", "today", "yesterday", "tomorrow". + // But Spark 32x to Spark 35x do not supports. 
+ // Currently JNI do not supports for (String date : new String[]{"epoch", "now", "today", "yesterday", "tomorrow"}) - assertThrows(ai.rapids.cudf.CudfException.class, () -> { + assertThrows(IllegalArgumentException.class, () -> { try (ColumnVector input = ColumnVector.fromStrings(date)) { - CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); + CastStrings.toTimestamp(input, ZoneId.of("UTC"), true); } }); From 167df5057a32c71e4083e2709438a9f582c550be Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 12 Jan 2024 18:40:08 +0800 Subject: [PATCH 15/35] Add comments --- .../nvidia/spark/rapids/jni/CastStrings.java | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index c515269c27..d27fa5c118 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -167,10 +167,23 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * - * Supports the following time zones: - * - Z - Zulu time zone UTC+0 - * - +|-[h]h:[m]m - * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * Spark supports the following zone id forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see + * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, + * and a suffix in the formats: + * - +|-h[h] + * - +|-hh[:]mm + * - +|-hh:mm:ss + * - +|-hhmmss + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Unlike Spark, Spark-Rapids currently does not support DST time zones. 
+ * + * Note: Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp. + * Spark31x supports cast special strings while Spark320+ do not supports * * Example: * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] @@ -219,10 +232,23 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` * - * Supports the following time zones: - * - Z - Zulu time zone UTC+0 - * - +|-[h]h:[m]m - * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * Spark supports the following zone id forms: + * - Z - Zulu time zone UTC+0 + * - +|-[h]h:[m]m + * - A short id, see + * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS + * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, + * and a suffix in the formats: + * - +|-h[h] + * - +|-hh[:]mm + * - +|-hh:mm:ss + * - +|-hhmmss + * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + * + * Unlike Spark, Spark-Rapids currently does not support DST time zones. + * + * Note: Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp. 
+ * Spark31x supports cast special strings while Spark320+ do not supports * * Example: * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] From ec1c68687c1cc4267cd7d9c118a1126f343587c9 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 10:08:28 +0800 Subject: [PATCH 16/35] Add comments; Add test cases --- .../com/nvidia/spark/rapids/jni/CastStrings.java | 9 +++++++-- .../nvidia/spark/rapids/jni/CastStringsTest.java | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index d27fa5c118..f45d2163d2 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -190,14 +190,19 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * ts = toTimestamp(input, "UTC", allowSpecialExpressions = true, ansiEnabled = * false) * ts is: ['2023-01-01 00:00:00', '2023-01-01T00:00:00'] + * + * Example: + * input = ["2023-01-01T08:00:00 non-exist-time-zone"] + * In ANSI mode: throws IllegalArgumentException + * In non-ANSI mode: return null value * * @param cv The input string column to be converted. * @param defaultTimeZone Use the default time zone if string does not * contain time zone. 
* @param ansiEnabled is Ansi mode * @return a timestamp column - * @throws IllegalArgumentException if cv contains invalid value when - * ansiEnabled is true + * @throws IllegalArgumentException if cv contains invalid value or the time zone is + * non-existed when ansiEnabled is true */ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) { if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) { diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 03ab672c4c..919f0d035e 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -472,5 +472,21 @@ void toTimestampTestWithTz() { CastStrings.toTimestamp(input, ZoneId.of("UTC"), true); } }); + + // Throw IllegalArgumentException for non-exist-tz in ANSI mode + assertThrows(IllegalArgumentException.class, () -> { + try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz")) { + CastStrings.toTimestamp(input, ZoneId.of("UTC"), true); + } + }); + + // Return null for non-exist-tz in non-Ansi mode + try ( + ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz"); + ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) { + Long[] expected = {null}; + AssertUtils.assertColumnsAreEqual(expected, actual); + } + } } From 3cba7d06d27eb3b888b27c5a0e4bb22fb3a9cf26 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 10:39:12 +0800 Subject: [PATCH 17/35] Address comments --- src/main/cpp/src/datetime_parser.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 3ba84f2d26..ec040e89f2 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -402,6 +402,8 @@ struct parse_timestamp_string_fn 
{ const char* const bytes = curr_ptr; const size_type bytes_length = end_ptr - curr_ptr; + // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, no_use_item] + // the two tail items are no use, but here keeps them as Spark does int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; int segments_len = 9; int i = 0; @@ -521,7 +523,9 @@ struct parse_timestamp_string_fn { segments[0] *= year_sign.value_or(1); // above is ported from Spark. - // set components + // copy segments to equivalent kernel timestamp_components + // Note: In order to keep above code is equivalent to Spark implementation, + // did not use `timestamp_components` directly to save values. ts_comp->year = segments[0]; ts_comp->month = static_cast(segments[1]); ts_comp->day = static_cast(segments[2]); From e6af1958ea768013adc41e5a227c0113c7a8fb21 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 10:52:20 +0800 Subject: [PATCH 18/35] Fix case --- src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 919f0d035e..372ae3bd42 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -481,10 +481,11 @@ void toTimestampTestWithTz() { }); // Return null for non-exist-tz in non-Ansi mode + Long[] nullExpected = {null}; try ( ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz"); + ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(nullExpected); ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) { - Long[] expected = {null}; AssertUtils.assertColumnsAreEqual(expected, actual); } From 0b33ff9f3b6b5a22475de3f7c1af3cfdea3277e7 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 10:54:14 +0800 
Subject: [PATCH 19/35] Update --- src/main/cpp/src/datetime_parser.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index ec040e89f2..86190ba861 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -383,8 +383,6 @@ struct parse_timestamp_string_fn { size_type* parsed_tz_length, cudf::string_view const& timestamp_str) const { - if (timestamp_str.empty()) { return ParseResult::INVALID; } - const char* curr_ptr = timestamp_str.data(); const char* end_ptr = curr_ptr + timestamp_str.size_bytes(); From a8fc54cf83589b5e17a82dbbe1c54e78ac6d79af Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 19:08:14 +0800 Subject: [PATCH 20/35] Support short time zone IDs, like PST, CTT...... --- src/main/cpp/src/CastStringJni.cpp | 7 ++-- src/main/cpp/src/datetime_parser.cu | 34 ++++++++++++------- src/main/cpp/src/datetime_parser.hpp | 3 +- .../nvidia/spark/rapids/jni/CastStrings.java | 7 ++-- .../spark/rapids/jni/GpuTimeZoneDB.java | 34 +++++++++++++++++++ .../spark/rapids/jni/CastStringsTest.java | 3 ++ 6 files changed, 68 insertions(+), 20 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 82a3dc3242..ab4f977750 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -264,7 +264,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, jlong transitions_handle, jlong tz_indices_col, jint tz_default_index, - jboolean ansi_enabled) + jboolean ansi_enabled, + jlong tz_short_ids) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { @@ -277,9 +278,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, auto const& tz_indices_view = cudf::strings_column_view(*reinterpret_cast(tz_indices_col)); auto const tz_index = static_cast(tz_default_index); - + const cudf::column_view *tz_short_ids_view = 
reinterpret_cast(tz_short_ids); auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( - input_view, transitions, tz_indices_view, tz_index, ansi_enabled); + input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 86190ba861..656215364d 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -123,6 +123,7 @@ struct parse_timestamp_string_fn { // looseTzInstant: int64>>. thrust::optional transitions = thrust::nullopt; thrust::optional tz_indices = thrust::nullopt; + thrust::optional tz_short_ids = thrust::nullopt; __device__ thrust::tuple operator()(const cudf::size_type& idx) const { @@ -161,6 +162,19 @@ struct parse_timestamp_string_fn { tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); + + // Map short TZ ID to region-based timezone if tz_view is a short ID + auto const& short_name_col = tz_short_ids->child(0); + auto const& region_based_col = tz_short_ids->child(1); + for (size_type i = 0; i < tz_short_ids->size(); i++) { + auto const& curr_short_id = short_name_col.element(i); + if (curr_short_id == tz_view) { + // find short ID, replace tz_view with mapped region TZ ID + tz_view = region_based_col.element(i); + break; + } + } + // Firstly, try parsing as utc-like timezone rep if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) { tz_offset = utc_offset; @@ -192,7 +206,6 @@ struct parse_timestamp_string_fn { } /** - * TODO: support CST/PST/AST * * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01. * This function is purposed to be fully align to Apache Spark's behavior. 
The @@ -207,8 +220,6 @@ struct parse_timestamp_string_fn { * without colon * hh only : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]) * hh:mm:(ss) : ^(GMT|UTC)?[+-](0[0-9]|1[0-8])([0-5][0-9])?([0-5][0-9])? - * special symbols: - * ^(Z|CST|PST|AST|...) * * additional restriction: 18:00:00 is the upper bound (which means 18:00:01 * is invalid) @@ -220,12 +231,6 @@ struct parse_timestamp_string_fn { char const* ptr = tz_lit.data(); - // try to parse Z - if (*ptr == 'Z') { - if (len > 1) return {0, 1}; - return {0, 0}; - } - size_t char_offset = 0; // skip UTC|GMT if existing if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') || @@ -549,7 +554,8 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu bool allow_tz_in_date_str = true, size_type default_tz_index = 1000000000, cudf::column_view const* transitions = nullptr, - cudf::strings_column_view const* tz_indices = nullptr) + cudf::strings_column_view const* tz_indices = nullptr, + cudf::column_view const* tz_short_ids = nullptr) { auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); @@ -580,6 +586,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream); + auto d_tz_short_ids = column_device_view::create(*tz_short_ids, stream); thrust::transform( rmm::exec_policy(stream), @@ -589,7 +596,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu thrust::make_tuple(result_col->mutable_view().begin(), result_valid_col->mutable_view().begin())), parse_timestamp_string_fn{ - *d_strings, default_tz_index, true, d_transitions, *d_tz_indices}); + *d_strings, default_tz_index, true, d_transitions, *d_tz_indices, *d_tz_short_ids}); } auto valid_view = result_valid_col->mutable_view(); @@ -633,11 +640,12 @@ 
std::unique_ptr string_to_timestamp_with_tz( cudf::column_view const& transitions, cudf::strings_column_view const& tz_indices, cudf::size_type default_tz_index, - bool ansi_mode) + bool ansi_mode, + cudf::column_view const& tz_short_ids) { if (input.size() == 0) { return nullptr; } return to_timestamp( - input, ansi_mode, true, default_tz_index, &transitions, &tz_indices); + input, ansi_mode, true, default_tz_index, &transitions, &tz_indices, &tz_short_ids); } /** diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index c7f9d5ec65..14bfa6a060 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -77,7 +77,8 @@ std::unique_ptr string_to_timestamp_with_tz( cudf::column_view const& transitions, cudf::strings_column_view const& tz_indices, cudf::size_type default_tz_index, - bool ansi_mode); + bool ansi_mode, + cudf::column_view const& tz_short_ids); /** * diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index f45d2163d2..455e71333c 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -218,9 +218,10 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString()); try (Table transitions = singleton.getTransitions(); - ColumnVector tzIndices = singleton.getZoneIDVector()) { + ColumnVector tzIndices = singleton.getZoneIDVector(); + ColumnVector tzShortIDs = singleton.getTimeZoneShortIDs()) { return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(), - tzIndices.getNativeView(), tzIndex, ansiEnabled)); + tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView())); } } @@ -291,7 +292,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base, boolean 
ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); private static native long toTimestamp(long input, - long transitions, long tzIndices, int tzIndex, boolean ansiEnabled); + long transitions, long tzIndices, int tzIndex, boolean ansiEnabled, long tzShortIDs); private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone, boolean ansiEnabled); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 6b09d2dda0..990d601889 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -42,6 +42,9 @@ public class GpuTimeZoneDB { private CompletableFuture> zoneIdToTableFuture; private CompletableFuture fixedTransitionsFuture; private CompletableFuture zoneIdVectorFuture; + // Used to store Java ZoneId.SHORT_IDS Map, e.g.: PST:America/Los_Angeles + // Note: also add a entry: Z->UTC + private HostColumnVector shortIDs; private boolean closed = false; @@ -171,10 +174,41 @@ private void loadData(Executor executor) throws IllegalStateException { } } + /** + * load ZoneId.SHORT_IDS and append Z->UTC. 
+ * The first 3 entries are: Z->UTC, PST->America/Los_Angeles, CTT->Asia/Shanghai + */ + private void loadTimeZoneShortIDs() { + HostColumnVector.DataType type = new HostColumnVector.StructType(false, + new HostColumnVector.BasicType(false, DType.STRING), + new HostColumnVector.BasicType(false, DType.STRING)); + ArrayList data = new ArrayList<>(); + // add Z->UTC + data.add(new HostColumnVector.StructData("Z", "UTC")); + // add PST CTT + for (Map.Entry e : ZoneId.SHORT_IDS.entrySet()) { + if (e.getKey().equals("PST") || e.getKey().equals("CTT")) { + data.add(new HostColumnVector.StructData(e.getKey(), e.getValue())); + } + } + // add others + for (Map.Entry e : ZoneId.SHORT_IDS.entrySet()) { + if (!(e.getKey().equals("PST") || e.getKey().equals("CTT"))) { + data.add(new HostColumnVector.StructData(e.getKey(), e.getValue())); + } + } + shortIDs = HostColumnVector.fromStructs(type, data); + } + + public ColumnVector getTimeZoneShortIDs() { + return shortIDs.copyToDevice(); + } + @SuppressWarnings("unchecked") private void doLoadData() { synchronized (this) { try { + loadTimeZoneShortIDs(); Map zoneIdToTable = new HashMap<>(); List> masterTransitions = new ArrayList<>(); // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings. 
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 372ae3bd42..6bee3c771c 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -404,6 +404,9 @@ void toTimestampTestWithTz() { entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3 -020103", 1571536943300000L)); entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 -8:08:01 ", 1571640105100000L)); entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1+4:59:59", 1571592825100000L)); + // short TZ ID: BST->Asia/Dhaka, CTT->Asia/Shanghai + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 CTT", 1699124695100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 BST", 1699124695100000L + 7200L * 1000000L)); // BST is 2 hours later than CTT int validDataSize = entries.size(); From 10eba22dec1ba22e5a3e6db57e0e563ebe1c1d50 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 15 Jan 2024 19:14:23 +0800 Subject: [PATCH 21/35] Format code --- src/main/cpp/src/CastStringJni.cpp | 18 +++++++----------- src/main/cpp/src/datetime_parser.cu | 16 +++++++--------- src/main/cpp/src/datetime_parser.hpp | 7 +++---- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index ab4f977750..fa48650f32 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -277,9 +277,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, reinterpret_cast(transitions_handle)->column(0); auto const& tz_indices_view = cudf::strings_column_view(*reinterpret_cast(tz_indices_col)); - auto const tz_index = static_cast(tz_default_index); - const cudf::column_view *tz_short_ids_view = reinterpret_cast(tz_short_ids); - auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( + 
auto const tz_index = static_cast(tz_default_index); + const cudf::column_view* tz_short_ids_view = reinterpret_cast(tz_short_ids); + auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } @@ -291,12 +291,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env, - jclass, - jlong input_column, - jboolean allow_time_zone, - jboolean ansi_enabled) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone( + JNIEnv* env, jclass, jlong input_column, jboolean allow_time_zone, jboolean ansi_enabled) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { @@ -304,8 +300,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* auto const& input_view = cudf::strings_column_view(*reinterpret_cast(input_column)); - auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz( - input_view, allow_time_zone, ansi_enabled); + auto ret_cv = + spark_rapids_jni::string_to_timestamp_without_tz(input_view, allow_time_zone, ansi_enabled); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 656215364d..ffaa8e6b6f 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -164,7 +164,7 @@ struct parse_timestamp_string_fn { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); // Map short TZ ID to region-based timezone if tz_view is a short ID - auto const& short_name_col = tz_short_ids->child(0); + auto const& short_name_col = tz_short_ids->child(0); auto const& region_based_col = 
tz_short_ids->child(1); for (size_type i = 0; i < tz_short_ids->size(); i++) { auto const& curr_short_id = short_name_col.element(i); @@ -405,8 +405,8 @@ struct parse_timestamp_string_fn { const char* const bytes = curr_ptr; const size_type bytes_length = end_ptr - curr_ptr; - // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, no_use_item] - // the two tail items are no use, but here keeps them as Spark does + // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, + // no_use_item] the two tail items are no use, but here keeps them as Spark does int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0}; int segments_len = 9; int i = 0; @@ -580,8 +580,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu thrust::make_zip_iterator( thrust::make_tuple(result_col->mutable_view().begin(), result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{ - *d_strings, default_tz_index, allow_tz_in_date_str}); + parse_timestamp_string_fn{*d_strings, default_tz_index, allow_tz_in_date_str}); } else { auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; @@ -655,10 +654,9 @@ std::unique_ptr string_to_timestamp_with_tz( * If allow_time_zone is false and string contains time zone, then the string is * invalid. 
*/ -std::unique_ptr string_to_timestamp_without_tz( - cudf::strings_column_view const& input, - bool allow_time_zone, - bool ansi_mode) +std::unique_ptr string_to_timestamp_without_tz(cudf::strings_column_view const& input, + bool allow_time_zone, + bool ansi_mode) { if (input.size() == 0) { return nullptr; } return to_timestamp(input, ansi_mode, allow_time_zone); diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 14bfa6a060..f750594f9f 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -131,9 +131,8 @@ std::unique_ptr string_to_timestamp_with_tz( * @returns the pointer of the timestamp result column, which points to nullptr * if there exists invalid inputs and ANSI mode is on. */ -std::unique_ptr string_to_timestamp_without_tz( - cudf::strings_column_view const& input, - bool allow_time_zone, - bool ansi_mode); +std::unique_ptr string_to_timestamp_without_tz(cudf::strings_column_view const& input, + bool allow_time_zone, + bool ansi_mode); } // namespace spark_rapids_jni From 24e81cd54d2f6b9acdfcac0729d9b19aa7a38a40 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 16 Jan 2024 09:48:34 +0800 Subject: [PATCH 22/35] Update comments --- .../com/nvidia/spark/rapids/jni/CastStrings.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 455e71333c..a87a754f35 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -182,8 +182,11 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * * Unlike Spark, Spark-Rapids currently does not support DST time zones. * - * Note: Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp. 
+ * Note: + * - Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp. * Spark31x supports cast special strings while Spark320+ do not supports + * - Do not support DST time zones, throw ai.rapids.cudf.CudfException + * if contains DST time zones. * * Example: * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] @@ -201,8 +204,9 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * contain time zone. * @param ansiEnabled is Ansi mode * @return a timestamp column - * @throws IllegalArgumentException if cv contains invalid value or the time zone is - * non-existed when ansiEnabled is true + * @throws IllegalArgumentException if any string in cv has invalid format or the time zone is + * non-existed/wrong when ansiEnabled is true + * @throws CudfException if time zone is a DST time zone */ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) { if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) { @@ -269,8 +273,9 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo * when do not allow time zone. * @param ansiEnabled is Ansi mode * @return a timestamp column - * @throws IllegalArgumentException if cv contains invalid value when - * ansiEnabled is true + * @throws IllegalArgumentException if any string in cv has invalid format or contains time zone + * while `allowTimeZone` is false when ANSI is true. 
+ * */ public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) { GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance(); From 374dede799d244e5a537f4158c2d8b2429c6b6e0 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 18 Jan 2024 17:22:52 +0800 Subject: [PATCH 23/35] Update comments --- src/main/cpp/src/datetime_parser.cu | 29 ++++++++++--------- .../spark/rapids/jni/CastStringsTest.java | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index ffaa8e6b6f..aa21459c0e 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -542,11 +542,13 @@ struct parse_timestamp_string_fn { }; /** - * The common entrance of string_to_timestamp, which combines two paths: - * with_timezone and without_timezone. This function returns the The - * transitions, tz_indices and default_tz_index are only for handling inputs - * with timezone. So, this function distinguish with_timezone calls from - * without_timezone ones by checking if transitions and tz_indices are nullptr. + * The common entrance of string_to_timestamp, two paths call this function: + * - `string_to_timestamp_with_tz` : with time zone + * - `string_to_timestamp_without_tz` : without time zone + * The parameters transitions, tz_indices and default_tz_index are only for handling + * inputs with timezone. + * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices + * are nullptr, otherwise called from `string_to_timestamp_with_tz`. * */ std::unique_ptr to_timestamp(cudf::strings_column_view const& input, @@ -630,9 +632,11 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu namespace spark_rapids_jni { /** - * Parse string column with time zone to timestamp column, - * Returns a pair of timestamp column and a bool indicates whether it successes. 
- * If does not have time zone in string, use the default time zone. + * Parse string column with time zone to timestamp column. + * If a string does not have time zone in it, use the default time zone. + * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null + * timestamp column otherwise. + * */ std::unique_ptr string_to_timestamp_with_tz( cudf::strings_column_view const& input, @@ -648,11 +652,10 @@ std::unique_ptr string_to_timestamp_with_tz( } /** - * Parse string column with time zone to timestamp column, - * Returns a pair of timestamp column and a bool indicates whether it successes. - * Do not use the time zone in string. - * If allow_time_zone is false and string contains time zone, then the string is - * invalid. + * Parse string column without time zone to timestamp column. + * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null + * timestamp column otherwise. + * */ std::unique_ptr string_to_timestamp_without_tz(cudf::strings_column_view const& input, bool allow_time_zone, diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 6bee3c771c..ca3049bfed 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -440,7 +440,7 @@ void toTimestampTestWithTz() { } }); - // Throw unsupported exception for symbols of special dates + // Throw IllegalArgumentException for symbols of special dates // Note: Spark 31x supports "epoch", "now", "today", "yesterday", "tomorrow". // But Spark 32x to Spark 35x do not supports. 
// Currently JNI do not supports From 1d0ef4c6b477e4ae5a39e305a37ce53e93141fc9 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 22 Jan 2024 23:20:38 +0800 Subject: [PATCH 24/35] Address comments --- src/main/cpp/src/datetime_parser.cu | 146 +++++++++--------- .../spark/rapids/jni/GpuTimeZoneDB.java | 64 +++----- 2 files changed, 94 insertions(+), 116 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index aa21459c0e..996b3a253a 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -119,8 +119,7 @@ struct parse_timestamp_string_fn { bool allow_tz_in_date_str = true; // The list column of transitions to figure out the correct offset // to adjust the timestamp. The type of the values in this column is - // LIST>. + // LIST>. thrust::optional transitions = thrust::nullopt; thrust::optional tz_indices = thrust::nullopt; thrust::optional tz_short_ids = thrust::nullopt; @@ -157,43 +156,44 @@ struct parse_timestamp_string_fn { // path with timezone, in which timezone offset has to be determined before // computing unix_timestamp - int64_t tz_offset; + int64_t utc_offset; if (tz_lit_ptr == nullptr) { - tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index); + // no tz in the string tailing, use default tz + utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); - // Map short TZ ID to region-based timezone if tz_view is a short ID - auto const& short_name_col = tz_short_ids->child(0); - auto const& region_based_col = tz_short_ids->child(1); - for (size_type i = 0; i < tz_short_ids->size(); i++) { - auto const& curr_short_id = short_name_col.element(i); - if (curr_short_id == tz_view) { - // find short ID, replace tz_view with mapped region TZ ID - tz_view = region_based_col.element(i); - break; - } + // map tz short IDs, has three map types: + // 1: Z->UTC; + // 2: 
short ID->regional based tz + // 3: MST->"-07:00" + auto const& short_tz_id_col = tz_short_ids->child(0); + auto const& map_to_tz_col = tz_short_ids->child(1); + auto const it = thrust::upper_bound( + thrust::seq, short_tz_id_col.begin(), short_tz_id_col.end(), tz_view); + if (it != short_tz_id_col.end() && *it == tz_view) { + auto short_tz_id_idx = static_cast(it - short_tz_id_col.begin()); + // found a map, replace with mapped tz + tz_view = map_to_tz_col.element(short_tz_id_idx); } // Firstly, try parsing as utc-like timezone rep - if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) { - tz_offset = utc_offset; - } else if (ret_code == 1) { + auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); + if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { + utc_offset = utc_offset; + } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { // Then, try parsing as region-based timezone ID auto tz_index = query_index_from_tz_db(tz_view); - // tz_index < size(tzDB): found the ID in tzDB - // size(tzDB) <= tz_index < size(tzIDs): found the ID but not supported - // yet tz_index == size(tzIDs): invalid timezone ID - if (tz_index > transitions->size()) { - if (tz_index == tz_indices->size()) - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); + if (tz_index < 0) { + // invalid tz return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::UNSUPPORTED); + ParseResult::INVALID); + } else { + // supported tz + utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), tz_index); } - tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index); } else { - // (ret_code == 2) quick path to mark value invalid + // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } } @@ -202,16 +202,20 @@ struct parse_timestamp_string_fn { auto const 
ts_unaligned = compute_epoch_us(ts_comp); return thrust::make_tuple( - cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, ParseResult::OK); + cudf::timestamp_us{cudf::duration_us{ts_unaligned - utc_offset * 1000000L}}, ParseResult::OK); } + enum ParseUtcLikeTzResult { + UTC_LIKE_TZ = 0, // successfully parsed the timezone offset + NOT_UTC_LIKE_TZ = 1, // not a valid UTC-like timezone representation, maybe valid region-based + INVALID = 2 // not a valid timezone representation + }; + /** * * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01. * This function is purposed to be fully align to Apache Spark's behavior. The - * function returns the status along with the result: 0 - successfully parsed - * the timezone offset 1 - not a valid UTC-like timezone representation, maybe - * valid region-based rep 2 - not a valid timezone representation + * function returns the status along with the ParseUtcLikeTzResult result. * * Valid patterns: * with colon @@ -224,7 +228,7 @@ struct parse_timestamp_string_fn { * additional restriction: 18:00:00 is the upper bound (which means 18:00:01 * is invalid) */ - __device__ inline thrust::pair parse_utc_like_tz( + __device__ inline thrust::pair parse_utc_like_tz( string_view const& tz_lit) const { size_type len = tz_lit.size_bytes(); @@ -239,7 +243,7 @@ struct parse_timestamp_string_fn { } // return for the pattern UTC|GMT (without exact offset) - if (len == char_offset) return {0, 0}; + if (len == char_offset) return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ}; // parse sign +|- char const sign_char = *(ptr + char_offset++); @@ -250,7 +254,7 @@ struct parse_timestamp_string_fn { sign = -1L; } else { // if the rep starts with UTC|GMT, it can NOT be region-based rep - return {0, char_offset < 3 ? 1 : 2}; + return {0, char_offset < 3 ? 
ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID}; } // parse hh:mm:ss @@ -259,14 +263,14 @@ struct parse_timestamp_string_fn { for (size_type i = 0; i < 3; i++) { // deal with the first digit hms[i] = *(ptr + char_offset++) - '0'; - if (hms[i] < 0 || hms[i] > 9) return {0, 2}; + if (hms[i] < 0 || hms[i] > 9) return {0, ParseUtcLikeTzResult::INVALID}; // deal with trailing single digit instant: // hh(GMT+8) - valid // mm(GMT+11:2) - must be separated from (h)h by `:` // ss(GMT-11:22:3) - invalid if (len == char_offset) { - if (i == 2 || (i == 1 && !has_colon)) return {0, 2}; + if (i == 2 || (i == 1 && !has_colon)) return {0, ParseUtcLikeTzResult::INVALID}; break; } @@ -275,68 +279,68 @@ struct parse_timestamp_string_fn { // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3) // 2. (i == 2) one_digit ss is invalid (+11:22:3) // 3. trailing `:` is invalid (GMT+8:) - if (i > 0 || len == ++char_offset) return {0, 2}; + if (i > 0 || len == ++char_offset) return {0, ParseUtcLikeTzResult::INVALID}; has_colon = true; continue; } // deal with the second digit auto digit = *(ptr + char_offset++) - '0'; - if (digit < 0 || digit > 9) return {0, 2}; + if (digit < 0 || digit > 9) return {0, ParseUtcLikeTzResult::INVALID}; hms[i] = hms[i] * 10 + digit; if (len == char_offset) break; // deal with `:` if (*(ptr + char_offset) == ':') { // trailing `:` is invalid (UTC+11:) - if (len == ++char_offset) return {0, 2}; + if (len == ++char_offset) return {0, ParseUtcLikeTzResult::INVALID}; has_colon = true; } } // the upper bound is 18:00:00 (regardless of sign) - if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2}; - if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2}; + if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, ParseUtcLikeTzResult::INVALID}; + if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, ParseUtcLikeTzResult::INVALID}; - return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0}; + return {sign * (hms[0] * 3600L + hms[1] * 60L 
+ hms[2]), ParseUtcLikeTzResult::UTC_LIKE_TZ}; } /** - * TODO: replace linear search with more efficient approach (like prefix tree) + * tz_indices is sorted, use binary search to find tz index. */ __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const { - auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) { - return tz->element(i) == tz_lit; - }; - auto ret = thrust::find_if(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(tz_indices->size()), - predicate); - - return *ret; + auto const it = thrust::upper_bound(thrust::seq, + tz_indices->begin(), + tz_indices->end(), + tz_lit); + if (it != tz_indices->end() && *it == tz_lit) { + return it - tz_indices->begin(); + } else { + return -1; + } } /** - * Perform binary search to search out the timezone offset based on loose epoch + * Perform binary search to search out the timezone offset based on local epoch * instants. Basically, this is the same approach as * `convert_timestamp_tz_functor`. 
*/ - __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, + __device__ inline int64_t extract_timezone_offset(int64_t local_epoch_second, size_type tz_index) const { + auto const& tz_instants = transitions->child().child(1); auto const& utc_offsets = transitions->child().child(2); - auto const& loose_instants = transitions->child().child(3); auto const local_transitions = cudf::list_device_view{*transitions, tz_index}; auto const list_size = local_transitions.size(); auto const transition_times = cudf::device_span( - loose_instants.data() + local_transitions.element_offset(0), + tz_instants.data() + local_transitions.element_offset(0), static_cast(list_size)); auto const it = thrust::upper_bound( - thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); + thrust::seq, transition_times.begin(), transition_times.end(), local_epoch_second); auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); auto const list_offset = local_transitions.element_offset(idx - 1); @@ -344,24 +348,9 @@ struct parse_timestamp_string_fn { } /** - * The formula to compute loose epoch from local time. The loose epoch is used - * to search for the corresponding timezone offset of specific zone ID from - * TimezoneDB. The target of loose epoch is to transfer local time to a number - * which is proportional to the real timestamp as easily as possible. Loose - * epoch, as a computation approach, helps us to align probe(kernel side) to - * the TimezoneDB(Java side). Then, we can apply binary search based on loose - * epoch instants of TimezoneDB to find out the correct timezone offset. 
- */ - __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const - { - return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + - ts.minute * 60L + ts.second; - } - - /** - * Leverage STL to convert local time to UTC unix_timestamp(in millisecond) + * Leverage STL to convert local time to UTC unix_timestamp(in seconds) */ - __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const + __device__ inline int64_t compute_epoch_s(timestamp_components const& ts) const { auto const ymd = // chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year}, @@ -369,8 +358,15 @@ struct parse_timestamp_string_fn { cuda::std::chrono::day{static_cast(ts.day)}); auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + return (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + } + /** + * Leverage STL to convert local time to UTC unix_timestamp(in milliseconds) + */ + __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const + { + int64_t timestamp_s = compute_epoch_s(ts); return timestamp_s * 1000000L + ts.microseconds; } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 990d601889..087aba7549 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -17,18 +17,18 @@ package com.nvidia.spark.rapids.jni; import java.time.Instant; -import java.time.LocalDateTime; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; import java.time.zone.ZoneRules; import java.time.zone.ZoneRulesException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import 
java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TimeZone; import java.util.concurrent.*; -import java.util.function.Function; import ai.rapids.cudf.*; @@ -175,26 +175,21 @@ private void loadData(Executor executor) throws IllegalStateException { } /** - * load ZoneId.SHORT_IDS and append Z->UTC. - * The first 3 entries are: Z->UTC, PST->America/Los_Angeles, CTT->Asia/Shanghai + * load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. */ private void loadTimeZoneShortIDs() { HostColumnVector.DataType type = new HostColumnVector.StructType(false, new HostColumnVector.BasicType(false, DType.STRING), new HostColumnVector.BasicType(false, DType.STRING)); ArrayList data = new ArrayList<>(); - // add Z->UTC - data.add(new HostColumnVector.StructData("Z", "UTC")); - // add PST CTT - for (Map.Entry e : ZoneId.SHORT_IDS.entrySet()) { - if (e.getKey().equals("PST") || e.getKey().equals("CTT")) { - data.add(new HostColumnVector.StructData(e.getKey(), e.getValue())); - } - } - // add others - for (Map.Entry e : ZoneId.SHORT_IDS.entrySet()) { - if (!(e.getKey().equals("PST") || e.getKey().equals("CTT"))) { - data.add(new HostColumnVector.StructData(e.getKey(), e.getValue())); + List idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet()); + idList.add("Z"); + Collections.sort(idList); + for (String id : idList) { + if (id.equals("Z")) { + data.add(new HostColumnVector.StructData(id, "UTC")); + } else { + data.add(new HostColumnVector.StructData(id, ZoneId.SHORT_IDS.get(id))); } } shortIDs = HostColumnVector.fromStructs(type, data); @@ -208,23 +203,26 @@ public ColumnVector getTimeZoneShortIDs() { private void doLoadData() { synchronized (this) { try { + // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. loadTimeZoneShortIDs(); + Map zoneIdToTable = new HashMap<>(); List> masterTransitions = new ArrayList<>(); // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings. 
// For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the // offset of "Asia/Shanghai" in timezoneDB. // - // Currently, we do NOT support all timezone IDs. For unsupported ones, we ought to throw Exception anyway. And - // for invalid ones, we replace them with NULL value when ANSI mode is off. Therefore, we need to distinguish the - // unsupported ones from invalid ones which means the unsupported Ids need to be collected as well. - // To distinguish supported IDs from unsupported ones, we place all unsupported IDs behind supported ones: - // 1. Collect the IDs of all supported timezones in the order of masterTransitions. - // 2. Append the IDs of all unsupported timezones after the suported ones. + // Currently, we do NOT support all timezone IDs. For unsupported time zones, like invalid ones, + // we replace them with NULL value when ANSI mode is off when parsing string to timestamp. + // This list only contains supported time zones. List zondIdList = new ArrayList<>(); List unsupportedZoneList = new ArrayList<>(); + + // sort the IDs + String[] availableIDs = TimeZone.getAvailableIDs(); + Arrays.sort(availableIDs); - for (String tzId : TimeZone.getAvailableIDs()) { + for (String tzId : availableIDs) { ZoneId zoneId; try { zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe @@ -257,17 +255,6 @@ private void doLoadData() { first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE) ); transitions.forEach(t -> { - // A simple approach to transform LocalDateTime to a value which is proportional to - // the exact EpochSecond. After caching these values along with EpochSeconds, we - // can easily search out which time zone transition rule we should apply according - // to LocalDateTime structs. The searching procedure is same as the binary search with - // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant" - // as search index instead of exact EpochSeconds. 
- Function localToLooseEpochSecond = lt -> - 86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L + - lt.getDayOfMonth() - 1) + - 3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond(); - // Whether transition is an overlap vs gap. // In Spark: // if it's a gap, then we use the offset after *on* the instant @@ -279,8 +266,7 @@ private void doLoadData() { new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds(), - localToLooseEpochSecond.apply(t.getDateTimeAfter()) + t.getOffsetAfter().getTotalSeconds() ) ); } else { @@ -288,8 +274,7 @@ private void doLoadData() { new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds(), - localToLooseEpochSecond.apply(t.getDateTimeBefore()) + t.getOffsetAfter().getTotalSeconds() ) ); } @@ -311,9 +296,6 @@ private void doLoadData() { HostColumnVector.DataType resultType = new HostColumnVector.ListType(false, childType); - // Append the IDs of all unsupported timezones after the suported ones. 
- zondIdList.addAll(unsupportedZoneList); - try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) { try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) { fixedTransitionsFuture.complete(fixedTransitions.incRefCount()); From b87584023c37678d9af7e1c6e86de042899b2e45 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 24 Jan 2024 00:18:16 +0800 Subject: [PATCH 25/35] Address comments --- src/main/cpp/src/datetime_parser.cu | 204 +++++++++++------- .../nvidia/spark/rapids/jni/CastStrings.java | 5 + .../spark/rapids/jni/GpuTimeZoneDB.java | 71 ++++-- .../spark/rapids/jni/CastStringsTest.java | 10 +- 4 files changed, 195 insertions(+), 95 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 996b3a253a..18750c9a27 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -103,6 +103,18 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); } +/** + * function to get a string from string view + */ +struct get_string_fn { + column_device_view const& string_view; + + __device__ cudf::string_view operator()(size_t idx) + { + return string_view.element(idx); + } +}; + /** * We have to distinguish INVALID value with UNSUPPORTED value. * INVALID means the value is invalid in Spark SQL. @@ -119,9 +131,10 @@ struct parse_timestamp_string_fn { bool allow_tz_in_date_str = true; // The list column of transitions to figure out the correct offset // to adjust the timestamp. The type of the values in this column is - // LIST>. + // LIST>. 
thrust::optional transitions = thrust::nullopt; - thrust::optional tz_indices = thrust::nullopt; + thrust::optional sorted_tz_names = thrust::nullopt; thrust::optional tz_short_ids = thrust::nullopt; __device__ thrust::tuple operator()(const cudf::size_type& idx) const @@ -159,42 +172,51 @@ struct parse_timestamp_string_fn { int64_t utc_offset; if (tz_lit_ptr == nullptr) { // no tz in the string tailing, use default tz - utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), default_tz_index); + utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); - // map tz short IDs, has three map types: - // 1: Z->UTC; - // 2: short ID->regional based tz - // 3: MST->"-07:00" - auto const& short_tz_id_col = tz_short_ids->child(0); - auto const& map_to_tz_col = tz_short_ids->child(1); - auto const it = thrust::upper_bound( - thrust::seq, short_tz_id_col.begin(), short_tz_id_col.end(), tz_view); - if (it != short_tz_id_col.end() && *it == tz_view) { - auto short_tz_id_idx = static_cast(it - short_tz_id_col.begin()); - // found a map, replace with mapped tz - tz_view = map_to_tz_col.element(short_tz_id_idx); + // map tz short IDs to time zone index in transitions. 
+ // Here only handle regional base tz map: short ID->regional based tz + // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00 + auto const short_tz_id_col = tz_short_ids->child(0); + auto const map_to_tz_col = tz_short_ids->child(1); + auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + get_string_fn{short_tz_id_col}); + auto string_iter_end = string_iter_begin + short_tz_id_col.size(); + auto it = thrust::lower_bound( + thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less()); + int tz_index_for_short_tz = -1; + if (it != string_iter_end && *it == tz_view) { + // found a map, get the time zone index + auto short_id_index = static_cast(it - string_iter_begin); + tz_index_for_short_tz = static_cast(map_to_tz_col.element(short_id_index)); } - // Firstly, try parsing as utc-like timezone rep - auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); - if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { - utc_offset = utc_offset; - } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { - // Then, try parsing as region-based timezone ID - auto tz_index = query_index_from_tz_db(tz_view); - if (tz_index < 0) { - // invalid tz - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); + if (tz_index_for_short_tz >= 0) { + // it's a supported short ID, and found the tz index + utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index_for_short_tz); + } else { + // Firstly, try parsing as utc-like timezone rep + // Note: parse_utc_like_tz handles special short IDs: EST: -05:00; HST: -10:00; MST: -07:00 + auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view); + if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { + utc_offset = fix_offset; + } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { + // Then, try parsing as region-based timezone ID + auto tz_index = query_index_from_tz_db(tz_view); + if (tz_index < 
0) { + // TODO: distinguish unsupported and invalid tz + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, + ParseResult::INVALID); + } else { + // supported tz + utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index); + } } else { - // supported tz - utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), tz_index); + // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } - } else { - // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } } @@ -205,10 +227,10 @@ struct parse_timestamp_string_fn { cudf::timestamp_us{cudf::duration_us{ts_unaligned - utc_offset * 1000000L}}, ParseResult::OK); } - enum ParseUtcLikeTzResult { - UTC_LIKE_TZ = 0, // successfully parsed the timezone offset - NOT_UTC_LIKE_TZ = 1, // not a valid UTC-like timezone representation, maybe valid region-based - INVALID = 2 // not a valid timezone representation + enum ParseUtcLikeTzResult { + UTC_LIKE_TZ = 0, // successfully parsed the timezone offset + NOT_UTC_LIKE_TZ = 1, // not a valid UTC-like timezone representation, maybe valid region-based + INVALID = 2 // not a valid timezone representation }; /** @@ -218,6 +240,11 @@ struct parse_timestamp_string_fn { * function returns the status along with the ParseUtcLikeTzResult result. 
* * Valid patterns: + * Z: means UTC + * short tz IDs that is UTC like + * EST: -05:00 + * HST: -10:00 + * MST: -07:00 * with colon * hh:mm : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9]) * hh:mm:ss : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9] @@ -235,6 +262,23 @@ struct parse_timestamp_string_fn { char const* ptr = tz_lit.data(); + // Z time zone + if (len == 1 && *ptr == 'Z') { return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ}; } + + // handle short tz IDs that is UTC like: EST, HST, MST + if (len == 3) { + if ((*ptr == 'E' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { + // EST: -05:00 + return {-5L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; + } else if ((*ptr == 'H' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { + // HST: -10:00 + return {-10L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; + } else if ((*ptr == 'M' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { + // MST: -07:00 + return {-7L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; + } + } + size_t char_offset = 0; // skip UTC|GMT if existing if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') || @@ -254,7 +298,8 @@ struct parse_timestamp_string_fn { sign = -1L; } else { // if the rep starts with UTC|GMT, it can NOT be region-based rep - return {0, char_offset < 3 ? ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID}; + return { + 0, char_offset < 3 ? ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID}; } // parse hh:mm:ss @@ -306,67 +351,75 @@ struct parse_timestamp_string_fn { } /** - * tz_indices is sorted, use binary search to find tz index. + * use binary search to find tz index. 
*/ __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const { - auto const it = thrust::upper_bound(thrust::seq, - tz_indices->begin(), - tz_indices->end(), - tz_lit); - if (it != tz_indices->end() && *it == tz_lit) { - return it - tz_indices->begin(); + auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + get_string_fn{*sorted_tz_names}); + auto string_iter_end = string_iter_begin + sorted_tz_names->size(); + auto it = thrust::lower_bound( + thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less()); + if (it != string_iter_end && *it == tz_lit) { + // found tz + return static_cast(it - string_iter_begin); } else { + // not found tz return -1; } } /** - * Perform binary search to search out the timezone offset based on local epoch + * Perform binary search to search out the offset from UTC based on local epoch * instants. Basically, this is the same approach as * `convert_timestamp_tz_functor`. */ - __device__ inline int64_t extract_timezone_offset(int64_t local_epoch_second, - size_type tz_index) const + __device__ inline int64_t compute_utc_offset(int64_t loose_epoch_second, size_type tz_index) const { - auto const& tz_instants = transitions->child().child(1); auto const& utc_offsets = transitions->child().child(2); + auto const& loose_instants = transitions->child().child(3); auto const local_transitions = cudf::list_device_view{*transitions, tz_index}; auto const list_size = local_transitions.size(); auto const transition_times = cudf::device_span( - tz_instants.data() + local_transitions.element_offset(0), + loose_instants.data() + local_transitions.element_offset(0), static_cast(list_size)); auto const it = thrust::upper_bound( - thrust::seq, transition_times.begin(), transition_times.end(), local_epoch_second); + thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); auto const 
list_offset = local_transitions.element_offset(idx - 1); - return static_cast(utc_offsets.element(list_offset)); } /** - * Leverage STL to convert local time to UTC unix_timestamp(in seconds) + * The formula to compute loose epoch from local time. The loose epoch is used + * to search for the corresponding timezone offset of specific zone ID from + * TimezoneDB. The target of loose epoch is to transfer local time to a number + * which is proportional to the real timestamp as easily as possible. Loose + * epoch, as a computation approach, helps us to align probe(kernel side) to + * the TimezoneDB(Java side). Then, we can apply binary search based on loose + * epoch instants of TimezoneDB to find out the correct timezone offset. */ - __device__ inline int64_t compute_epoch_s(timestamp_components const& ts) const + __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const { - auto const ymd = // chrono class handles the leap year calculations for us - cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year}, - cuda::std::chrono::month{static_cast(ts.month)}, - cuda::std::chrono::day{static_cast(ts.day)}); - auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - - return (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + + ts.minute * 60L + ts.second; } /** - * Leverage STL to convert local time to UTC unix_timestamp(in milliseconds) + * Leverage STL to convert local time to UTC unix_timestamp(in millisecond) */ __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const { - int64_t timestamp_s = compute_epoch_s(ts); + auto const ymd = // chrono class handles the leap year calculations for us + cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year}, + cuda::std::chrono::month{static_cast(ts.month)}, + cuda::std::chrono::day{static_cast(ts.day)}); + auto days = 
cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); + + int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; return timestamp_s * 1000000L + ts.microseconds; } @@ -541,19 +594,20 @@ struct parse_timestamp_string_fn { * The common entrance of string_to_timestamp, two paths call this function: * - `string_to_timestamp_with_tz` : with time zone * - `string_to_timestamp_without_tz` : without time zone - * The parameters transitions, tz_indices and default_tz_index are only for handling + * The parameters transitions, sorted_tz_names and default_tz_index are only for handling * inputs with timezone. - * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices + * It's called from `string_to_timestamp_without_tz` if transitions and sorted_tz_names * are nullptr, otherwise called from `string_to_timestamp_with_tz`. * */ -std::unique_ptr to_timestamp(cudf::strings_column_view const& input, - bool ansi_mode, - bool allow_tz_in_date_str = true, - size_type default_tz_index = 1000000000, - cudf::column_view const* transitions = nullptr, - cudf::strings_column_view const* tz_indices = nullptr, - cudf::column_view const* tz_short_ids = nullptr) +std::unique_ptr to_timestamp( + cudf::strings_column_view const& input, + bool ansi_mode, + bool allow_tz_in_date_str = true, + size_type default_tz_index = 1000000000, + cudf::column_view const* transitions = nullptr, + cudf::strings_column_view const* sorted_tz_names = nullptr, + cudf::column_view const* tz_short_ids = nullptr) { auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); @@ -570,7 +624,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu auto result_valid_col = cudf::make_fixed_width_column( cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); - if (transitions == nullptr || tz_indices == nullptr) { + if (transitions == nullptr || 
sorted_tz_names == nullptr) { thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -582,7 +636,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu } else { auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; - auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream); + auto d_sorted_tz_names = cudf::column_device_view::create(sorted_tz_names->parent(), stream); auto d_tz_short_ids = column_device_view::create(*tz_short_ids, stream); thrust::transform( @@ -593,7 +647,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu thrust::make_tuple(result_col->mutable_view().begin(), result_valid_col->mutable_view().begin())), parse_timestamp_string_fn{ - *d_strings, default_tz_index, true, d_transitions, *d_tz_indices, *d_tz_short_ids}); + *d_strings, default_tz_index, true, d_transitions, *d_sorted_tz_names, *d_tz_short_ids}); } auto valid_view = result_valid_col->mutable_view(); @@ -637,14 +691,14 @@ namespace spark_rapids_jni { std::unique_ptr string_to_timestamp_with_tz( cudf::strings_column_view const& input, cudf::column_view const& transitions, - cudf::strings_column_view const& tz_indices, + cudf::strings_column_view const& sorted_tz_names, cudf::size_type default_tz_index, bool ansi_mode, cudf::column_view const& tz_short_ids) { if (input.size() == 0) { return nullptr; } return to_timestamp( - input, ansi_mode, true, default_tz_index, &transitions, &tz_indices, &tz_short_ids); + input, ansi_mode, true, default_tz_index, &transitions, &sorted_tz_names, &tz_short_ids); } /** diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index a87a754f35..cb0a29ce1f 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -266,6 +266,11 @@ 
public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo * allowSpecialExpressions = true, ansiEnabled = false) * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00'] * + * Note: this function will never use the time zones in the strings. + * allowTimeZone means whether allow time zone in the timestamp string. + * If allowTimeZone is true, the time zones are ignored if has. + * if allowTimeZone is false, then this function will throw exception if has any time zone in the strings and it's ANSI mode. + * * @param cv The input string column to be converted. * @param allowTimeZone whether allow time zone in the timestamp * string. e.g.: diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 087aba7549..8189364b6e 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -17,6 +17,7 @@ package com.nvidia.spark.rapids.jni; import java.time.Instant; +import java.time.LocalDateTime; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; import java.time.zone.ZoneRules; @@ -24,11 +25,13 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TimeZone; import java.util.concurrent.*; +import java.util.function.Function; import ai.rapids.cudf.*; @@ -175,21 +178,29 @@ private void loadData(Executor executor) throws IllegalStateException { } /** - * load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. + * load ZoneId.SHORT_IDS and map to time zone index in transition table. 
*/ - private void loadTimeZoneShortIDs() { + private void loadTimeZoneShortIDs(Map zoneIdToTable) { HostColumnVector.DataType type = new HostColumnVector.StructType(false, new HostColumnVector.BasicType(false, DType.STRING), - new HostColumnVector.BasicType(false, DType.STRING)); + new HostColumnVector.BasicType(false, DType.INT32)); ArrayList data = new ArrayList<>(); + // copy short IDs List idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet()); - idList.add("Z"); + // sort short IDs Collections.sort(idList); for (String id : idList) { - if (id.equals("Z")) { - data.add(new HostColumnVector.StructData(id, "UTC")); + String mapTo = ZoneId.SHORT_IDS.get(id); + if (mapTo.startsWith("+") || mapTo.startsWith("-")) { + // skip: EST: -05:00; HST: -10:00; MST: -07:00 + // kernel will handle EST, HST, MST + // ZoneId.SHORT_IDS is deprecated, so it will not probably change } else { - data.add(new HostColumnVector.StructData(id, ZoneId.SHORT_IDS.get(id))); + Integer index = zoneIdToTable.get(mapTo); + // some short IDs are DST, skip unsupported + if (index != null) { + data.add(new HostColumnVector.StructData(id, index)); + } } } shortIDs = HostColumnVector.fromStructs(type, data); @@ -203,9 +214,6 @@ public ColumnVector getTimeZoneShortIDs() { private void doLoadData() { synchronized (this) { try { - // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. - loadTimeZoneShortIDs(); - Map zoneIdToTable = new HashMap<>(); List> masterTransitions = new ArrayList<>(); // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings. 
@@ -218,20 +226,40 @@ private void doLoadData() { List zondIdList = new ArrayList<>(); List unsupportedZoneList = new ArrayList<>(); - // sort the IDs - String[] availableIDs = TimeZone.getAvailableIDs(); - Arrays.sort(availableIDs); - - for (String tzId : availableIDs) { + // collect zone id and sort + List ids = new ArrayList<>(); + for (String tzId : TimeZone.getAvailableIDs()) { ZoneId zoneId; try { zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe + ids.add(zoneId); } catch (ZoneRulesException e) { // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, // this use is deprecated due to ambiguity reasons (same abbrevation can be used for // multiple time zones). These are not supported by ZoneId.of(...) directly here. continue; } + } + Collections.sort(ids, new Comparator() { + @Override + public int compare(ZoneId o1, ZoneId o2) { + // sort by `getId` + return o1.getId().compareTo(o2.getId()); + } + }); + + // A simple approach to transform LocalDateTime to a value which is proportional to + // the exact EpochSecond. After caching these values along with EpochSeconds, we + // can easily search out which time zone transition rule we should apply according + // to LocalDateTime structs. The searching procedure is same as the binary search with + // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant" + // as search index instead of exact EpochSeconds. 
+ Function localToLooseEpochSecond = lt -> + 86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L + + lt.getDayOfMonth() - 1) + + 3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond(); + + for (ZoneId zoneId : ids) { ZoneRules zoneRules = zoneId.getRules(); // Filter by non-repeating rules if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { @@ -266,7 +294,8 @@ private void doLoadData() { new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds() + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time ) ); } else { @@ -274,7 +303,8 @@ private void doLoadData() { new HostColumnVector.StructData( t.getInstant().getEpochSecond(), t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds() + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time ) ); } @@ -288,6 +318,9 @@ private void doLoadData() { } zoneIdToTableFuture.complete(zoneIdToTable); + // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. + loadTimeZoneShortIDs(zoneIdToTable); + HostColumnVector.DataType childType = new HostColumnVector.StructType(false, new HostColumnVector.BasicType(false, DType.INT64), new HostColumnVector.BasicType(false, DType.INT64), @@ -331,6 +364,10 @@ private HostColumnVector getHostFixedTransitions() { } } + /** + * get map from time zone to time zone index in transition table. + * @return map from time zone to time zone index in transition table. 
+ */ public Map getZoneIDMap() { try { return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index ca3049bfed..86f5203249 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -31,6 +31,7 @@ import ai.rapids.cudf.AssertUtils; import ai.rapids.cudf.ColumnVector; import ai.rapids.cudf.DType; +import ai.rapids.cudf.HostColumnVector; import ai.rapids.cudf.Table; public class CastStringsTest { @@ -407,6 +408,10 @@ void toTimestampTestWithTz() { // short TZ ID: BST->Asia/Dhaka, CTT->Asia/Shanghai entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 CTT", 1699124695100000L)); entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 BST", 1699124695100000L + 7200L * 1000000L)); // BST is 2 hours later than CTT + // short TZ ID: EST: -05:00; HST: -10:00; MST: -07:00 + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 EST", 1699124695100000L + 13L * 3600L * 1000000L)); // EST is 8 + 5 hours later than Asia/Shanghai + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 HST", 1699124695100000L + 18L * 3600L * 1000000L)); // HST is 8 + 10 hours later than Asia/Shanghai + entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 MST", 1699124695100000L + 15L * 3600L * 1000000L)); // MST is 8 + 7 hours later than Asia/Shanghai int validDataSize = entries.size(); @@ -425,7 +430,6 @@ void toTimestampTestWithTz() { entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 -180001", null)); entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC+18:00:10", null)); entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT-23:5", null)); - List inputs = new ArrayList<>(); List expects = new ArrayList<>(); for (Map.Entry entry : entries) { @@ -434,9 +438,9 @@ void 
toTimestampTestWithTz() { } // Throw unsupported exception for symbols because Europe/London contains DST rules - assertThrows(ai.rapids.cudf.CudfException.class, () -> { + assertThrows(IllegalArgumentException.class, () -> { try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 Europe/London")) { - CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); + CastStrings.toTimestamp(input, ZoneId.of("UTC"), true); } }); From 93a8331de69255b500299cde504b63dbb1aea0e1 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 24 Jan 2024 15:26:12 +0800 Subject: [PATCH 26/35] Fixes; Comments --- src/main/cpp/src/datetime_parser.cu | 26 ++++++++++--------- .../nvidia/spark/rapids/jni/CastStrings.java | 5 ++-- .../spark/rapids/jni/GpuTimeZoneDB.java | 11 +++++--- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 18750c9a27..0d1040766e 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -176,21 +176,23 @@ struct parse_timestamp_string_fn { } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); - // map tz short IDs to time zone index in transitions. + // try to map tz short IDs to time zone index in transitions. 
// Here only handle regional base tz map: short ID->regional based tz // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00 - auto const short_tz_id_col = tz_short_ids->child(0); - auto const map_to_tz_col = tz_short_ids->child(1); - auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - get_string_fn{short_tz_id_col}); - auto string_iter_end = string_iter_begin + short_tz_id_col.size(); - auto it = thrust::lower_bound( - thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less()); int tz_index_for_short_tz = -1; - if (it != string_iter_end && *it == tz_view) { - // found a map, get the time zone index - auto short_id_index = static_cast(it - string_iter_begin); - tz_index_for_short_tz = static_cast(map_to_tz_col.element(short_id_index)); + if (tz_view.length() == 3) { // short ID length is always 3 + auto const short_tz_id_col = tz_short_ids->child(0); + auto const map_to_tz_col = tz_short_ids->child(1); + auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + get_string_fn{short_tz_id_col}); + auto string_iter_end = string_iter_begin + short_tz_id_col.size(); + auto it = thrust::lower_bound( + thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less()); + if (it != string_iter_end && *it == tz_view) { + // found a map, get the time zone index + auto short_id_index = static_cast(it - string_iter_begin); + tz_index_for_short_tz = static_cast(map_to_tz_col.element(short_id_index)); + } } if (tz_index_for_short_tz >= 0) { diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index cb0a29ce1f..4a91dc8557 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -185,8 +185,8 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * Note: * - Do not 
support cast special strings(epoch now today yesterday tomorrow) to timestamp. * Spark31x supports cast special strings while Spark320+ do not supports - * - Do not support DST time zones, throw ai.rapids.cudf.CudfException - * if contains DST time zones. + * - Do not support DST time zones, return null in non-ANSI mode. + * TODO: DST support. * * Example: * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "] @@ -206,7 +206,6 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { * @return a timestamp column * @throws IllegalArgumentException if any string in cv has invalid format or the time zone is * non-existed/wrong when ansiEnabled is true - * @throws CudfException if time zone is a DST time zone */ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) { if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) { diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 8189364b6e..eac216478a 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -45,8 +45,7 @@ public class GpuTimeZoneDB { private CompletableFuture> zoneIdToTableFuture; private CompletableFuture fixedTransitionsFuture; private CompletableFuture zoneIdVectorFuture; - // Used to store Java ZoneId.SHORT_IDS Map, e.g.: PST:America/Los_Angeles - // Note: also add a entry: Z->UTC + // Used to store Java ZoneId.SHORT_IDS Map: PST -> index of America/Los_Angeles in transition table. private HostColumnVector shortIDs; private boolean closed = false; @@ -179,6 +178,7 @@ private void loadData(Executor executor) throws IllegalStateException { /** * load ZoneId.SHORT_IDS and map to time zone index in transition table. 
+ * Note: ignored EST: -05:00; HST: -10:00; MST: -07:00 */ private void loadTimeZoneShortIDs(Map zoneIdToTable) { HostColumnVector.DataType type = new HostColumnVector.StructType(false, @@ -190,6 +190,7 @@ private void loadTimeZoneShortIDs(Map zoneIdToTable) { // sort short IDs Collections.sort(idList); for (String id : idList) { + assert(id.length() == 3); // short ID lenght is always 3 String mapTo = ZoneId.SHORT_IDS.get(id); if (mapTo.startsWith("+") || mapTo.startsWith("-")) { // skip: EST: -05:00; HST: -10:00; MST: -07:00 @@ -200,6 +201,8 @@ private void loadTimeZoneShortIDs(Map zoneIdToTable) { // some short IDs are DST, skip unsupported if (index != null) { data.add(new HostColumnVector.StructData(id, index)); + } else { + // TODO: index should not be null after DST is supported. } } } @@ -231,7 +234,7 @@ private void doLoadData() { for (String tzId : TimeZone.getAvailableIDs()) { ZoneId zoneId; try { - zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe + zoneId = ZoneId.of(tzId, ZoneId.SHORT_IDS).normalized(); // we use the normalized form to dedupe ids.add(zoneId); } catch (ZoneRulesException e) { // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, @@ -318,7 +321,7 @@ public int compare(ZoneId o1, ZoneId o2) { } zoneIdToTableFuture.complete(zoneIdToTable); - // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs. 
+ // load ZoneId.SHORT_IDS loadTimeZoneShortIDs(zoneIdToTable); HostColumnVector.DataType childType = new HostColumnVector.StructType(false, From c8dffb130547e88271fad80e0abdbf39a2d75f2b Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 25 Jan 2024 10:12:30 +0800 Subject: [PATCH 27/35] Refector GpuTimeZoneDB; Add comment for year has max 6 digits --- src/main/cpp/src/datetime_parser.cu | 8 +++++++- .../nvidia/spark/rapids/jni/CastStrings.java | 12 ++++------- .../spark/rapids/jni/GpuTimeZoneDB.java | 20 +++++++++++-------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 0d1040766e..0fdcc4f9bf 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -64,7 +64,13 @@ namespace { * Represents local date time in a time zone. */ struct timestamp_components { - int32_t year; // max 6 digits + /** + * year: Max 6 digits. + * Spark stores timestamp into Long in microseconds. + * A Long is able to represent a timestamp within [+-]200 thousand years. 
+ * Calculated from: Long.MaxValue/MinValue / microseconds_per_year + */ + int32_t year; int8_t month; int8_t day; int8_t hour; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 74cea9e902..d67bc9c208 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -213,13 +213,10 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo defaultTimeZone.toString())); } - GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance(); - GpuTimeZoneDB.cacheDatabase(); - Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString()); - - try (Table transitions = singleton.getTransitions(); - ColumnVector tzIndices = singleton.getZoneIDVector(); - ColumnVector tzShortIDs = singleton.getTimeZoneShortIDs()) { + Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.normalized().toString()); + try (Table transitions = GpuTimeZoneDB.getTransitions(); + ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector(); + ColumnVector tzShortIDs = GpuTimeZoneDB.getTimeZoneShortIDs()) { return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(), tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView())); } @@ -279,7 +276,6 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo * */ public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) { - GpuTimeZoneDB.cacheDatabase(); return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone, ansiEnabled)); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 02a171cae6..bf4e171c8d 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ 
b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -444,32 +444,36 @@ public int compare(ZoneId o1, ZoneId o2) { * get map from time zone to time zone index in transition table. * @return map from time zone to time zone index in transition table. */ - public Map getZoneIDMap() { - return zoneIdToTable; + public static Map getZoneIDMap() { + cacheDatabase(); + return instance.zoneIdToTable; } /** * Get a map from short ID to time zone index in transitions for the short ID mapped time zone * @return */ - public ColumnVector getTimeZoneShortIDs() { - return shortIDs.copyToDevice(); + public static ColumnVector getTimeZoneShortIDs() { + cacheDatabase(); + return instance.shortIDs.copyToDevice(); } /** * Get a time zone list which is corresponding to the transitions * @return */ - public ColumnVector getZoneIDVector() { - return zoneIdVector.copyToDevice(); + public static ColumnVector getZoneIDVector() { + cacheDatabase(); + return instance.zoneIdVector.copyToDevice(); } /** * Transition table * @return */ - public Table getTransitions() { - try (ColumnVector fixedTransitions = getFixedTransitions()) { + public static Table getTransitions() { + cacheDatabase(); + try (ColumnVector fixedTransitions = instance.getFixedTransitions()) { return new Table(fixedTransitions); } } From 4104173b26630e502a0c4a6fd90c6ebe312f90af Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 25 Jan 2024 10:18:13 +0800 Subject: [PATCH 28/35] format cpp code --- .clang-format | 155 ++++++++++++++++++++++++++++ src/main/cpp/src/datetime_parser.cu | 2 +- 2 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..26b9a5bf4c --- /dev/null +++ b/.clang-format @@ -0,0 +1,155 @@ +--- +# Refer to the following link for the explanation of each params: +# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google 
+AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false +# This is deprecated +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + # disabling the below splits, else, they'll just add to the vertical length of source files! 
+ SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: WebKit +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - 
ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +# Enabling comment reflow causes doxygen comments to be messed up in their formats! +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: c++17 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +# Be consistent with indent-width, even for people who use tab for indentation! +TabWidth: 2 +UseTab: Never diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index 0fdcc4f9bf..dd30dae537 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -69,7 +69,7 @@ struct timestamp_components { * Spark stores timestamp into Long in microseconds. * A Long is able to represent a timestamp within [+-]200 thousand years. 
* Calculated from: Long.MaxValue/MinValue / microseconds_per_year - */ + */ int32_t year; int8_t month; int8_t day; From 5af012c1c764a2273a50e445c024fa67410e747b Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 25 Jan 2024 17:06:03 +0800 Subject: [PATCH 29/35] Remove .clang-format --- .clang-format | 155 -------------------------------------------------- 1 file changed, 155 deletions(-) delete mode 100644 .clang-format diff --git a/.clang-format b/.clang-format deleted file mode 100644 index 26b9a5bf4c..0000000000 --- a/.clang-format +++ /dev/null @@ -1,155 +0,0 @@ ---- -# Refer to the following link for the explanation of each params: -# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: true -AlignConsecutiveBitFields: true -AlignConsecutiveDeclarations: false -AlignConsecutiveMacros: true -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: true -AllowShortCaseLabelsOnASingleLine: true -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLambdasOnASingleLine: true -AllowShortLoopsOnASingleLine: false -# This is deprecated -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false -BinPackParameters: false -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - # disabling the below splits, else, 
they'll just add to the vertical length of source files! - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -BreakAfterJavaFieldAnnotations: false -BreakBeforeBinaryOperators: None -BreakBeforeBraces: WebKit -BreakBeforeInheritanceComma: false -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -BreakStringLiterals: true -ColumnLimit: 100 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true -# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform -ConstructorInitializerIndentWidth: 2 -ContinuationIndentWidth: 2 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - 
PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -# Enabling comment reflow causes doxygen comments to be messed up in their formats! -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: c++17 -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -# Be consistent with indent-width, even for people who use tab for indentation! 
-TabWidth: 2 -UseTab: Never From 97a8f8f0135a922acb3bc20a6f118c78b207638f Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 25 Jan 2024 17:09:29 +0800 Subject: [PATCH 30/35] Fix do not support non-normalized time zone, like: Etc/GMT; Optimize short time zone ID handling, remove binary search on short IDs --- src/main/cpp/src/CastStringJni.cpp | 14 +- src/main/cpp/src/datetime_parser.cu | 154 ++++------ src/main/cpp/src/datetime_parser.hpp | 12 +- .../nvidia/spark/rapids/jni/CastStrings.java | 11 +- .../spark/rapids/jni/GpuTimeZoneDB.java | 278 +++++++++--------- .../spark/rapids/jni/CastStringsTest.java | 8 +- 6 files changed, 211 insertions(+), 266 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index fa48650f32..1d39e7152e 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -264,8 +264,7 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, jlong transitions_handle, jlong tz_indices_col, jint tz_default_index, - jboolean ansi_enabled, - jlong tz_short_ids) + jboolean ansi_enabled) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { @@ -275,12 +274,11 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env, cudf::strings_column_view(*reinterpret_cast(input_column)); auto const transitions = reinterpret_cast(transitions_handle)->column(0); - auto const& tz_indices_view = - cudf::strings_column_view(*reinterpret_cast(tz_indices_col)); - auto const tz_index = static_cast(tz_default_index); - const cudf::column_view* tz_short_ids_view = reinterpret_cast(tz_short_ids); - auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( - input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view); + const cudf::column_view* tz_indices_view = + reinterpret_cast(tz_indices_col); + auto const tz_index = static_cast(tz_default_index); + auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz( + input_view, 
transitions, *tz_indices_view, tz_index, ansi_enabled); if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); } } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index dd30dae537..fb72944cdb 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -140,8 +140,7 @@ struct parse_timestamp_string_fn { // LIST>. thrust::optional transitions = thrust::nullopt; - thrust::optional sorted_tz_names = thrust::nullopt; - thrust::optional tz_short_ids = thrust::nullopt; + thrust::optional tz_indices = thrust::nullopt; __device__ thrust::tuple operator()(const cudf::size_type& idx) const { @@ -181,50 +180,23 @@ struct parse_timestamp_string_fn { utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index); } else { auto tz_view = string_view(tz_lit_ptr, tz_lit_len); - - // try to map tz short IDs to time zone index in transitions. - // Here only handle regional base tz map: short ID->regional based tz - // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00 - int tz_index_for_short_tz = -1; - if (tz_view.length() == 3) { // short ID length is always 3 - auto const short_tz_id_col = tz_short_ids->child(0); - auto const map_to_tz_col = tz_short_ids->child(1); - auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - get_string_fn{short_tz_id_col}); - auto string_iter_end = string_iter_begin + short_tz_id_col.size(); - auto it = thrust::lower_bound( - thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less()); - if (it != string_iter_end && *it == tz_view) { - // found a map, get the time zone index - auto short_id_index = static_cast(it - string_iter_begin); - tz_index_for_short_tz = static_cast(map_to_tz_col.element(short_id_index)); - } - } - - if (tz_index_for_short_tz >= 0) { - // it's a supported short ID, and found the tz index - utc_offset = 
compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index_for_short_tz); - } else { - // Firstly, try parsing as utc-like timezone rep - // Note: parse_utc_like_tz handles special short IDs: EST: -05:00; HST: -10:00; MST: -07:00 - auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view); - if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { - utc_offset = fix_offset; - } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { - // Then, try parsing as region-based timezone ID - auto tz_index = query_index_from_tz_db(tz_view); - if (tz_index < 0) { - // TODO: distinguish unsupported and invalid tz - return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, - ParseResult::INVALID); - } else { - // supported tz - utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index); - } - } else { - // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid + // Firstly, try parsing as utc-like timezone rep + auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view); + if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { + utc_offset = fix_offset; + } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { + // Then, try parsing as region-based timezone ID + auto tz_index = query_index_from_tz_db(tz_view); + if (tz_index < 0) { + // TODO: distinguish unsupported and invalid tz return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); + } else { + // supported tz + utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index); } + } else { + // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid + return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } } @@ -247,12 +219,6 @@ struct parse_timestamp_string_fn { * This function is purposed to be fully align to Apache Spark's behavior. The * function returns the status along with the ParseUtcLikeTzResult result. 
* - * Valid patterns: - * Z: means UTC - * short tz IDs that is UTC like - * EST: -05:00 - * HST: -10:00 - * MST: -07:00 * with colon * hh:mm : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9]) * hh:mm:ss : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9] @@ -270,23 +236,6 @@ struct parse_timestamp_string_fn { char const* ptr = tz_lit.data(); - // Z time zone - if (len == 1 && *ptr == 'Z') { return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ}; } - - // handle short tz IDs that is UTC like: EST, HST, MST - if (len == 3) { - if ((*ptr == 'E' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { - // EST: -05:00 - return {-5L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; - } else if ((*ptr == 'H' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { - // HST: -10:00 - return {-10L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; - } else if ((*ptr == 'M' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) { - // MST: -07:00 - return {-7L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ}; - } - } - size_t char_offset = 0; // skip UTC|GMT if existing if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') || @@ -363,14 +312,18 @@ struct parse_timestamp_string_fn { */ __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const { - auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - get_string_fn{*sorted_tz_names}); - auto string_iter_end = string_iter_begin + sorted_tz_names->size(); - auto it = thrust::lower_bound( + auto const tz_col = tz_indices->child(0); + auto const index_in_transition_col = tz_indices->child(1); + + auto string_iter_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), get_string_fn{tz_col}); + auto string_iter_end = string_iter_begin + tz_col.size(); + auto it = thrust::lower_bound( thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less()); if (it != string_iter_end && *it == tz_lit) { // found tz - return static_cast(it - string_iter_begin); + auto tz_name_index = 
static_cast(it - string_iter_begin); + return static_cast(index_in_transition_col.element(tz_name_index)); } else { // not found tz return -1; @@ -602,20 +555,18 @@ struct parse_timestamp_string_fn { * The common entrance of string_to_timestamp, two paths call this function: * - `string_to_timestamp_with_tz` : with time zone * - `string_to_timestamp_without_tz` : without time zone - * The parameters transitions, sorted_tz_names and default_tz_index are only for handling + * The parameters transitions, tz_indices and default_tz_index are only for handling * inputs with timezone. - * It's called from `string_to_timestamp_without_tz` if transitions and sorted_tz_names + * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices * are nullptr, otherwise called from `string_to_timestamp_with_tz`. * */ -std::unique_ptr to_timestamp( - cudf::strings_column_view const& input, - bool ansi_mode, - bool allow_tz_in_date_str = true, - size_type default_tz_index = 1000000000, - cudf::column_view const* transitions = nullptr, - cudf::strings_column_view const* sorted_tz_names = nullptr, - cudf::column_view const* tz_short_ids = nullptr) +std::unique_ptr to_timestamp(cudf::strings_column_view const& input, + bool ansi_mode, + bool allow_tz_in_date_str = true, + size_type default_tz_index = 1000000000, + cudf::column_view const* transitions = nullptr, + cudf::column_view const* tz_indices = nullptr) { auto const stream = cudf::get_default_stream(); auto const mr = rmm::mr::get_current_device_resource(); @@ -632,7 +583,7 @@ std::unique_ptr to_timestamp( auto result_valid_col = cudf::make_fixed_width_column( cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); - if (transitions == nullptr || sorted_tz_names == nullptr) { + if (transitions == nullptr || tz_indices == nullptr) { thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -644,18 +595,16 @@ std::unique_ptr to_timestamp( } else { 
auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; - auto d_sorted_tz_names = cudf::column_device_view::create(sorted_tz_names->parent(), stream); - auto d_tz_short_ids = column_device_view::create(*tz_short_ids, stream); - - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - thrust::make_zip_iterator( - thrust::make_tuple(result_col->mutable_view().begin(), - result_valid_col->mutable_view().begin())), - parse_timestamp_string_fn{ - *d_strings, default_tz_index, true, d_transitions, *d_sorted_tz_names, *d_tz_short_ids}); + auto d_tz_indices = cudf::column_device_view::create(*tz_indices, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + thrust::make_zip_iterator( + thrust::make_tuple(result_col->mutable_view().begin(), + result_valid_col->mutable_view().begin())), + parse_timestamp_string_fn{ + *d_strings, default_tz_index, true, d_transitions, *d_tz_indices}); } auto valid_view = result_valid_col->mutable_view(); @@ -696,17 +645,14 @@ namespace spark_rapids_jni { * timestamp column otherwise. 
* */ -std::unique_ptr string_to_timestamp_with_tz( - cudf::strings_column_view const& input, - cudf::column_view const& transitions, - cudf::strings_column_view const& sorted_tz_names, - cudf::size_type default_tz_index, - bool ansi_mode, - cudf::column_view const& tz_short_ids) +std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_view const& input, + cudf::column_view const& transitions, + cudf::column_view const& tz_indices, + cudf::size_type default_tz_index, + bool ansi_mode) { if (input.size() == 0) { return nullptr; } - return to_timestamp( - input, ansi_mode, true, default_tz_index, &transitions, &sorted_tz_names, &tz_short_ids); + return to_timestamp(input, ansi_mode, true, default_tz_index, &transitions, &tz_indices); } /** diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index f750594f9f..2d45f68dca 100644 --- a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -72,13 +72,11 @@ namespace spark_rapids_jni { * @returns the pointer of the timestamp result column, which points to nullptr * if there exists invalid inputs and ANSI mode is on. 
*/ -std::unique_ptr string_to_timestamp_with_tz( - cudf::strings_column_view const& input, - cudf::column_view const& transitions, - cudf::strings_column_view const& tz_indices, - cudf::size_type default_tz_index, - bool ansi_mode, - cudf::column_view const& tz_short_ids); +std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_view const& input, + cudf::column_view const& transitions, + cudf::column_view const& tz_indices, + cudf::size_type default_tz_index, + bool ansi_mode); /** * diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index d67bc9c208..3c4c4a3cc6 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -210,15 +210,14 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) { public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) { if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) { throw new IllegalArgumentException(String.format("Unsupported timezone: %s", - defaultTimeZone.toString())); + defaultTimeZone.getId())); } - Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.normalized().toString()); + Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.getId()); try (Table transitions = GpuTimeZoneDB.getTransitions(); - ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector(); - ColumnVector tzShortIDs = GpuTimeZoneDB.getTimeZoneShortIDs()) { + ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector()) { return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(), - tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView())); + tzIndices.getNativeView(), tzIndex, ansiEnabled)); } } @@ -291,7 +290,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long 
fromIntegersWithBase(long nativeColumnView, int base); private static native long toTimestamp(long input, - long transitions, long tzIndices, int tzIndex, boolean ansiEnabled, long tzShortIDs); + long transitions, long tzIndices, int tzIndex, boolean ansiEnabled); private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone, boolean ansiEnabled); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index bf4e171c8d..4c84ae1892 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -34,8 +34,10 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TimeZone; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; @@ -69,12 +71,8 @@ public class GpuTimeZoneDB { // zone id to index in `fixedTransitions` private Map zoneIdToTable; - // Used to store Java ZoneId.SHORT_IDS Map, e.g.: For PST -> America/Los_Angeles - // Save: PST -> index of America/Los_Angeles in transition table. 
- private HostColumnVector shortIDs; - - // zone id list - private HostColumnVector zoneIdVector; + // host column vector for `zoneIdToTable`, sorted by time zone strings + private HostColumnVector zoneIdToTableVec; // Guarantee singleton instance private GpuTimeZoneDB() { @@ -218,13 +216,9 @@ private void closeResources() { fixedTransitions.close(); fixedTransitions = null; } - if (shortIDs != null) { - shortIDs.close(); - shortIDs = null; - } - if (zoneIdVector != null) { - zoneIdVector.close(); - zoneIdVector = null; + if (zoneIdToTableVec != null) { + zoneIdToTableVec.close(); + zoneIdToTableVec = null; } } @@ -284,74 +278,36 @@ public static ZoneId getZoneId(String timeZoneId) { return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS); } - /** - * load ZoneId.SHORT_IDS and map to time zone index in transition table. - * Note: ignored EST: -05:00; HST: -10:00; MST: -07:00 - */ - private void loadTimeZoneShortIDs(Map zoneIdToTable) { - HostColumnVector.DataType type = new HostColumnVector.StructType(false, - new HostColumnVector.BasicType(false, DType.STRING), - new HostColumnVector.BasicType(false, DType.INT32)); - ArrayList data = new ArrayList<>(); - // copy short IDs - List idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet()); - // sort short IDs - Collections.sort(idList); - for (String id : idList) { - assert(id.length() == 3); // short ID lenght is always 3 - String mapTo = ZoneId.SHORT_IDS.get(id); - if (mapTo.startsWith("+") || mapTo.startsWith("-")) { - // skip: EST: -05:00; HST: -10:00; MST: -07:00 - // kernel will handle EST, HST, MST - // ZoneId.SHORT_IDS is deprecated, so it will not probably change - } else { - Integer index = zoneIdToTable.get(mapTo); - // some short IDs are DST, skip unsupported - if (index != null) { - data.add(new HostColumnVector.StructData(id, index)); - } else { - // TODO: index should not be null after DST is supported. 
- } - } - } - shortIDs = HostColumnVector.fromStructs(type, data); - } - @SuppressWarnings("unchecked") private void loadData() { try { - zoneIdToTable = new HashMap<>(); - List> masterTransitions = new ArrayList<>(); - // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings. - // For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the - // offset of "Asia/Shanghai" in timezoneDB. - // - // Currently, we do NOT support all timezone IDs. For unsupported time zones, like invalid ones, - // we replace them with NULL value when ANSI mode is off when parsing string to timestamp. - // This list only contains supported time zones. - List zondIdList = new ArrayList<>(); - - // collect zone id and sort - List ids = new ArrayList<>(); - for (String tzId : TimeZone.getAvailableIDs()) { - ZoneId zoneId; - try { - zoneId = ZoneId.of(tzId, ZoneId.SHORT_IDS).normalized(); // we use the normalized form to dedupe - ids.add(zoneId); - } catch (ZoneRulesException e) { - // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however, - // this use is deprecated due to ambiguity reasons (same abbrevation can be used for - // multiple time zones). These are not supported by ZoneId.of(...) directly here. - continue; + // Note: ZoneId.normalized will transform fixed offset time zone to standard fixed offset + // e.g.: ZoneId.of("Etc/GMT").normalized.getId = Z; ZoneId.of("Etc/GMT+0").normalized.getId = Z + // Both Etc/GMT and Etc/GMT+0 have normalized Z. + // We use the normalized form to dedupe, + // but should record map from TimeZone.getAvailableIDs() Set to normalized Set. + // `fixedTransitions` saves transitions for normalized time zones. + // Spark uses time zones from TimeZone.getAvailableIDs() + // So we have a Map from TimeZone.getAvailableIDs() to index of `fixedTransitions`. 
+ + // get and sort time zones + String[] timeZones = TimeZone.getAvailableIDs(); + List sortedTimeZones = new ArrayList<>(Arrays.asList(timeZones)); + // Note: Z is a special normalized time zone from UTC: ZoneId.of("UTC").normalized = Z + // TimeZone.getAvailableIDs does not contains Z and ZoneId.SHORT_IDS also does not contain Z + // Should add Z to `zoneIdToTable` + sortedTimeZones.add("Z"); + Collections.sort(sortedTimeZones); + + // Note: Spark uses ZoneId.SHORT_IDS + // `TimeZone.getAvailableIDs` contains all keys in `ZoneId.SHORT_IDS` + // So do not need extra work for ZoneId.SHORT_IDS, here just check this assumption + for (String tz : ZoneId.SHORT_IDS.keySet()) { + if (!sortedTimeZones.contains(tz)) { + throw new IllegalStateException( + String.format("Can not find short Id %s in time zones %s", tz, sortedTimeZones)); } } - Collections.sort(ids, new Comparator() { - @Override - public int compare(ZoneId o1, ZoneId o2) { - // sort by `getId` - return o1.getId().compareTo(o2.getId()); - } - }); // A simple approach to transform LocalDateTime to a value which is proportional to // the exact EpochSecond. 
After caching these values along with EpochSeconds, we @@ -364,66 +320,30 @@ public int compare(ZoneId o1, ZoneId o2) { lt.getDayOfMonth() - 1) + 3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond(); - for (ZoneId zoneId : ids) { - ZoneRules zoneRules = zoneId.getRules(); + List> masterTransitions = new ArrayList<>(); + + // map: normalizedTimeZone -> index in fixedTransitions + Map mapForNormalizedTimeZone = new HashMap<>(); + // go though all time zones and save by normalized time zone + List sortedSupportedTimeZones = new ArrayList<>(); + for (String timeZone : sortedTimeZones) { + ZoneId normalizedZoneId = ZoneId.of(timeZone, ZoneId.SHORT_IDS).normalized(); + String normalizedTimeZone = normalizedZoneId.getId(); + ZoneRules zoneRules = normalizedZoneId.getRules(); // Filter by non-repeating rules if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) { continue; } - if (!zoneIdToTable.containsKey(zoneId.getId())) { - List transitions = zoneRules.getTransitions(); + sortedSupportedTimeZones.add(timeZone); + if (!mapForNormalizedTimeZone.containsKey(normalizedTimeZone)) { // dedup + List data = getTransitionData(localToLooseEpochSecond, zoneRules); + // add transition data for time zone int idx = masterTransitions.size(); - List data = new ArrayList<>(); - if (zoneRules.isFixedOffset()) { - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE) - ); - } else { - // Capture the first official offset (before any transition) using Long min - ZoneOffsetTransition first = transitions.get(0); - data.add( - new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, - first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE) - ); - transitions.forEach(t -> { - // Whether transition is an overlap vs gap. 
- // In Spark: - // if it's a gap, then we use the offset after *on* the instant - // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping - // So, for the transition to UTC, you need to compare to instant + {offset before} - // The time math still uses {offset after} - if (t.isGap()) { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds(), - localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time - ) - ); - } else { - data.add( - new HostColumnVector.StructData( - t.getInstant().getEpochSecond(), - t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), - t.getOffsetAfter().getTotalSeconds(), - localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time - ) - ); - } - }); - } + mapForNormalizedTimeZone.put(normalizedTimeZone, idx); masterTransitions.add(data); - zoneIdToTable.put(zoneId.getId(), idx); - // Collect the IDs of all supported timezones in the order of masterTransitions - zondIdList.add(zoneId.getId()); } } - // load ZoneId.SHORT_IDS - loadTimeZoneShortIDs(zoneIdToTable); - HostColumnVector.DataType childType = new HostColumnVector.StructType(false, new HostColumnVector.BasicType(false, DType.INT64), new HostColumnVector.BasicType(false, DType.INT64), @@ -432,39 +352,117 @@ public int compare(ZoneId o1, ZoneId o2) { HostColumnVector.DataType resultType = new HostColumnVector.ListType(false, childType); + // generate all transitions for all time zones fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0])); - zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0])); + + // generate `zoneIdToTable`, key should be time zone not normalized time zone + zoneIdToTable = new HashMap<>(); + for (String timeZone : 
sortedSupportedTimeZones) { + // map from time zone to normalized + String normalized = ZoneId.of(timeZone, ZoneId.SHORT_IDS).normalized().getId(); + Integer index = mapForNormalizedTimeZone.get(normalized); + if (index != null) { + zoneIdToTable.put(timeZone, index); + } else { + throw new IllegalStateException("Could not find index for normalized time zone " + normalized); + } + } + // generate host vector + zoneIdToTableVec = generateZoneIdToTableVec(sortedSupportedTimeZones, zoneIdToTable); + } catch (IllegalStateException e) { + throw e; } catch (Exception e) { throw new IllegalStateException("load time zone DB cache failed!", e); } + } + // generate transition data for a time zone + private List getTransitionData(Function localToLooseEpochSecond, + ZoneRules zoneRules) { + List transitions = zoneRules.getTransitions(); + List data = new ArrayList<>(); + if (zoneRules.isFixedOffset()) { + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE) + ); + } else { + // Capture the first official offset (before any transition) using Long min + ZoneOffsetTransition first = transitions.get(0); + data.add( + new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE, + first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE) + ); + transitions.forEach(t -> { + // Whether transition is an overlap vs gap. 
+ // In Spark: + // if it's a gap, then we use the offset after *on* the instant + // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping + // So, for the transition to UTC, you need to compare to instant + {offset before} + // The time math still uses {offset after} + if (t.isGap()) { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time + ) + ); + } else { + data.add( + new HostColumnVector.StructData( + t.getInstant().getEpochSecond(), + t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(), + t.getOffsetAfter().getTotalSeconds(), + localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time + ) + ); + } + }); + } + return data; } /** - * get map from time zone to time zone index in transition table. - * @return map from time zone to time zone index in transition table. + * Generate map from time zone to index in transition table. 
+ * regular time zone map to normalized time zone, then get from + * @param sortedSupportedTimeZones is sorted and supported time zones + * @param zoneIdToTableMap is a map from non-normalized time zone to index in transition table */ - public static Map getZoneIDMap() { - cacheDatabase(); - return instance.zoneIdToTable; + private static HostColumnVector generateZoneIdToTableVec(List sortedSupportedTimeZones, Map zoneIdToTableMap) { + HostColumnVector.DataType type = new HostColumnVector.StructType(false, + new HostColumnVector.BasicType(false, DType.STRING), + new HostColumnVector.BasicType(false, DType.INT32)); + ArrayList data = new ArrayList<>(); + + for (String timeZone : sortedSupportedTimeZones) { + Integer mapTo = zoneIdToTableMap.get(timeZone); + if (mapTo != null) { + data.add(new HostColumnVector.StructData(timeZone, mapTo)); + } else { + throw new IllegalStateException("Could not find index for time zone " + timeZone); + } + } + return HostColumnVector.fromStructs(type, data); } /** - * Get a map from short ID to time zone index in transitions for the short ID mapped time zone - * @return + * get map from time zone to time zone index in transition table. + * @return map from time zone to time zone index in transition table. 
*/ - public static ColumnVector getTimeZoneShortIDs() { + public static Map getZoneIDMap() { cacheDatabase(); - return instance.shortIDs.copyToDevice(); + return instance.zoneIdToTable; } /** - * Get a time zone list which is corresponding to the transitions + * Get vector from time zone to index in transition table * @return */ public static ColumnVector getZoneIDVector() { cacheDatabase(); - return instance.zoneIdVector.copyToDevice(); + return instance.zoneIdToTableVec.copyToDevice(); } /** @@ -493,7 +491,7 @@ private ColumnVector getFixedTransitions() { * @return list of fixed transitions */ List getHostFixedTransitions(String zoneId) { - zoneId = ZoneId.of(zoneId).normalized().toString(); // we use the normalized form to dedupe + zoneId = ZoneId.of(zoneId, ZoneId.SHORT_IDS).normalized().toString(); // we use the normalized form to dedupe Integer idx = getZoneIDMap().get(zoneId); if (idx == null) { return null; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index 86f5203249..de2a7738f6 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -412,6 +412,10 @@ void toTimestampTestWithTz() { entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 EST", 1699124695100000L + 13L * 3600L * 1000000L)); // EST is 8 + 5 hours later than Asia/Shanghai entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 HST", 1699124695100000L + 18L * 3600L * 1000000L)); // HST is 8 + 10 hours later than Asia/Shanghai entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 MST", 1699124695100000L + 15L * 3600L * 1000000L)); // MST is 8 + 7 hours later than Asia/Shanghai + // test time zones not in notmalized names, e.g,: ZoneId.of("Etc/GMT").normalized.getId = Z; ZoneId.of("Etc/GMT+0").normalized.getId = Z; Etc/GMT+10 -> -10:00 + entries.add(new 
AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT", 1571610824100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+0", 1571610824100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571592825100000L)); int validDataSize = entries.size(); @@ -459,8 +463,10 @@ void toTimestampTestWithTz() { try ( ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0])); ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expects.toArray(new Long[0])); - ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) { + ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false); + ColumnVector actual2 = CastStrings.toTimestamp(input, ZoneId.of("Z"), false)) { AssertUtils.assertColumnsAreEqual(expected, actual); + AssertUtils.assertColumnsAreEqual(expected, actual2); } // Should NOT throw exception because all inputs are valid From 0a7efd9b473880f8a64ed5a2f9d8571d423a5333 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 26 Jan 2024 13:34:19 +0800 Subject: [PATCH 31/35] Refactor to address comments --- src/main/cpp/src/datetime_parser.cu | 144 +++++++++--------- src/main/cpp/src/datetime_parser.hpp | 16 +- .../spark/rapids/jni/GpuTimeZoneDB.java | 2 +- 3 files changed, 78 insertions(+), 84 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index fb72944cdb..e1ad607555 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,24 +16,17 @@ #include "datetime_parser.hpp" -#include -#include - -#include - #include #include #include #include #include #include - #include #include +#include #include #include - -#include #include #include #include @@ -49,15 +42,12 @@ #include #include -using column = cudf::column; -using column_device_view = cudf::column_device_view; -using column_view = cudf::column_view; -using lists_column_device_view = cudf::detail::lists_column_device_view; -using size_type = cudf::size_type; -using string_view = cudf::string_view; -using struct_view = cudf::struct_view; -using table_view = cudf::table_view; +#include +#include +#include + +namespace spark_rapids_jni { namespace { /** @@ -99,11 +89,11 @@ __device__ __host__ inline bool is_whitespace(const char chr) __device__ __host__ bool is_valid_digits(int segment, int digits) { // A Long is able to represent a timestamp within [+-]200 thousand years - const int constexpr maxDigitsYear = 6; + constexpr int maxDigitsYear = 6; // For the nanosecond part, more than 6 digits is allowed, but will be // truncated. 
return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - // For the zoneId segment(7), it's could be zero digits when it's a + // For the zoneId segment(7), it could be zero digits when it's a // region-based zone ID (segment == 7 && digits <= 2) || (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2); @@ -113,7 +103,7 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) * function to get a string from string view */ struct get_string_fn { - column_device_view const& string_view; + cudf::column_device_view const& string_view; __device__ cudf::string_view operator()(size_t idx) { @@ -132,15 +122,18 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 }; template struct parse_timestamp_string_fn { - column_device_view const d_strings; - size_type default_tz_index; - bool allow_tz_in_date_str = true; + // below three are required: + cudf::column_device_view const& d_strings; + cudf::size_type const default_tz_index; + bool const allow_tz_in_date_str; + + // below two are optional: // The list column of transitions to figure out the correct offset // to adjust the timestamp. The type of the values in this column is // LIST>. 
- thrust::optional transitions = thrust::nullopt; - thrust::optional tz_indices = thrust::nullopt; + thrust::optional transitions = thrust::nullopt; + thrust::optional tz_indices = thrust::nullopt; __device__ thrust::tuple operator()(const cudf::size_type& idx) const { @@ -152,16 +145,15 @@ struct parse_timestamp_string_fn { auto const d_str = d_strings.element(idx); timestamp_components ts_comp{}; - char const* tz_lit_ptr = nullptr; - size_type tz_lit_len = 0; + char const* tz_lit_ptr = nullptr; + cudf::size_type tz_lit_len = 0; switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) { case ParseResult::INVALID: return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); case ParseResult::UNSUPPORTED: return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED); - case ParseResult::OK: - default: break; + case ParseResult::OK: break; } if constexpr (!with_timezone) { @@ -179,14 +171,14 @@ struct parse_timestamp_string_fn { // no tz in the string tailing, use default tz utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index); } else { - auto tz_view = string_view(tz_lit_ptr, tz_lit_len); + auto const tz_view = cudf::string_view(tz_lit_ptr, tz_lit_len); // Firstly, try parsing as utc-like timezone rep auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view); if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) { utc_offset = fix_offset; } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) { // Then, try parsing as region-based timezone ID - auto tz_index = query_index_from_tz_db(tz_view); + auto const tz_index = query_index_from_tz_db(tz_view); if (tz_index < 0) { // TODO: distinguish unsupported and invalid tz return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); @@ -216,7 +208,7 @@ struct parse_timestamp_string_fn { /** * * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01. 
- * This function is purposed to be fully align to Apache Spark's behavior. The + * This function is purposed to be fully aligned to Apache Spark's behavior. The * function returns the status along with the ParseUtcLikeTzResult result. * * with colon @@ -230,9 +222,9 @@ struct parse_timestamp_string_fn { * is invalid) */ __device__ inline thrust::pair parse_utc_like_tz( - string_view const& tz_lit) const + cudf::string_view const& tz_lit) const { - size_type len = tz_lit.size_bytes(); + cudf::size_type const len = tz_lit.size_bytes(); char const* ptr = tz_lit.data(); @@ -262,7 +254,7 @@ struct parse_timestamp_string_fn { // parse hh:mm:ss int64_t hms[3] = {0L, 0L, 0L}; bool has_colon = false; - for (size_type i = 0; i < 3; i++) { + for (cudf::size_type i = 0; i < 3; i++) { // deal with the first digit hms[i] = *(ptr + char_offset++) - '0'; if (hms[i] < 0 || hms[i] > 9) return {0, ParseUtcLikeTzResult::INVALID}; @@ -287,7 +279,7 @@ struct parse_timestamp_string_fn { } // deal with the second digit - auto digit = *(ptr + char_offset++) - '0'; + auto const digit = *(ptr + char_offset++) - '0'; if (digit < 0 || digit > 9) return {0, ParseUtcLikeTzResult::INVALID}; hms[i] = hms[i] * 10 + digit; @@ -310,19 +302,20 @@ struct parse_timestamp_string_fn { /** * use binary search to find tz index. 
*/ - __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const + __device__ inline int query_index_from_tz_db(cudf::string_view const& tz_lit) const { auto const tz_col = tz_indices->child(0); auto const index_in_transition_col = tz_indices->child(1); - auto string_iter_begin = + auto const string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), get_string_fn{tz_col}); - auto string_iter_end = string_iter_begin + tz_col.size(); - auto it = thrust::lower_bound( - thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less()); + auto const string_iter_end = string_iter_begin + tz_col.size(); + auto const it = thrust::lower_bound( + thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less()); if (it != string_iter_end && *it == tz_lit) { // found tz - auto tz_name_index = static_cast(it - string_iter_begin); + auto const tz_name_index = + static_cast(thrust::distance(string_iter_begin, it)); return static_cast(index_in_transition_col.element(tz_name_index)); } else { // not found tz @@ -335,7 +328,8 @@ struct parse_timestamp_string_fn { * instants. Basically, this is the same approach as * `convert_timestamp_tz_functor`. 
*/ - __device__ inline int64_t compute_utc_offset(int64_t loose_epoch_second, size_type tz_index) const + __device__ inline int64_t compute_utc_offset(int64_t const loose_epoch_second, + cudf::size_type const tz_index) const { auto const& utc_offsets = transitions->child().child(2); auto const& loose_instants = transitions->child().child(3); @@ -349,7 +343,7 @@ struct parse_timestamp_string_fn { auto const it = thrust::upper_bound( thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second); - auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); + auto const idx = static_cast(thrust::distance(transition_times.begin(), it)); auto const list_offset = local_transitions.element_offset(idx - 1); return static_cast(utc_offsets.element(list_offset)); } @@ -362,6 +356,10 @@ struct parse_timestamp_string_fn { * epoch, as a computation approach, helps us to align probe(kernel side) to * the TimezoneDB(Java side). Then, we can apply binary search based on loose * epoch instants of TimezoneDB to find out the correct timezone offset. + * + * Loose epoch column is used for binary search. + * Here we use 400 days a year, it's safe, because mapping from local time to + * loose epoch is monotonic. 
*/ __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const { @@ -370,7 +368,7 @@ struct parse_timestamp_string_fn { } /** - * Leverage STL to convert local time to UTC unix_timestamp(in millisecond) + * Leverage STL to convert local time to UTC timestamp(in microseconds) */ __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const { @@ -378,9 +376,10 @@ struct parse_timestamp_string_fn { cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year}, cuda::std::chrono::month{static_cast(ts.month)}, cuda::std::chrono::day{static_cast(ts.day)}); - auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); + auto const days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; + int64_t const timestamp_s = + (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second; return timestamp_s * 1000000L + ts.microseconds; } @@ -395,7 +394,7 @@ struct parse_timestamp_string_fn { __device__ inline ParseResult parse_string_to_timestamp_us( timestamp_components* ts_comp, char const** parsed_tz_ptr, - size_type* parsed_tz_length, + cudf::size_type* parsed_tz_length, cudf::string_view const& timestamp_str) const { const char* curr_ptr = timestamp_str.data(); @@ -412,8 +411,8 @@ struct parse_timestamp_string_fn { if (curr_ptr == end_ptr) { return ParseResult::INVALID; } - const char* const bytes = curr_ptr; - const size_type bytes_length = end_ptr - curr_ptr; + const char* const bytes = curr_ptr; + const cudf::size_type bytes_length = end_ptr - curr_ptr; // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, // no_use_item] the two tail items are no use, but here keeps them as Spark does @@ -424,7 +423,6 @@ struct parse_timestamp_string_fn { int current_segment_digits = 0; size_t j = 0; int digits_milli = 0; - // bool just_time = false; thrust::optional 
year_sign; if ('-' == bytes[j] || '+' == bytes[j]) { if ('-' == bytes[j]) { @@ -436,11 +434,10 @@ struct parse_timestamp_string_fn { } while (j < bytes_length) { - char b = bytes[j]; - int parsed_value = static_cast(b - '0'); + char const b = bytes[j]; + int const parsed_value = static_cast(b - '0'); if (parsed_value < 0 || parsed_value > 9) { if (0 == j && 'T' == b) { - // just_time = true; i += 3; } else if (i < 2) { if (b == '-') { @@ -450,7 +447,6 @@ struct parse_timestamp_string_fn { current_segment_digits = 0; i += 1; } else if (0 == i && ':' == b && !year_sign.has_value()) { - // just_time = true; if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; } segments[3] = current_segment_value; current_segment_value = 0; @@ -561,17 +557,17 @@ struct parse_timestamp_string_fn { * are nullptr, otherwise called from `string_to_timestamp_with_tz`. * */ -std::unique_ptr to_timestamp(cudf::strings_column_view const& input, - bool ansi_mode, - bool allow_tz_in_date_str = true, - size_type default_tz_index = 1000000000, - cudf::column_view const* transitions = nullptr, - cudf::column_view const* tz_indices = nullptr) +std::unique_ptr to_timestamp( + cudf::strings_column_view const& input, + bool const ansi_mode, + bool const allow_tz_in_date_str, + cudf::size_type const default_tz_index = -1, + cudf::column_view const* transitions = nullptr, + cudf::column_view const* tz_indices = nullptr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const stream = cudf::get_default_stream(); - auto const mr = rmm::mr::get_current_device_resource(); - - auto d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); // column to store the result timestamp auto result_col = cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, @@ -593,9 
+589,9 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu result_valid_col->mutable_view().begin())), parse_timestamp_string_fn{*d_strings, default_tz_index, allow_tz_in_date_str}); } else { - auto const ft_cdv_ptr = column_device_view::create(*transitions, stream); - auto const d_transitions = lists_column_device_view{*ft_cdv_ptr}; - auto d_tz_indices = cudf::column_device_view::create(*tz_indices, stream); + auto const ft_cdv_ptr = cudf::column_device_view::create(*transitions, stream); + auto const d_transitions = cudf::detail::lists_column_device_view{*ft_cdv_ptr}; + auto const d_tz_indices = cudf::column_device_view::create(*tz_indices, stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -634,9 +630,7 @@ std::unique_ptr to_timestamp(cudf::strings_column_view const& inpu return result_col; } -} // namespace - -namespace spark_rapids_jni { +} // anonymous namespace /** * Parse string column with time zone to timestamp column. @@ -648,8 +642,8 @@ namespace spark_rapids_jni { std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_view const& input, cudf::column_view const& transitions, cudf::column_view const& tz_indices, - cudf::size_type default_tz_index, - bool ansi_mode) + cudf::size_type const default_tz_index, + bool const ansi_mode) { if (input.size() == 0) { return nullptr; } return to_timestamp(input, ansi_mode, true, default_tz_index, &transitions, &tz_indices); @@ -662,8 +656,8 @@ std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_v * */ std::unique_ptr string_to_timestamp_without_tz(cudf::strings_column_view const& input, - bool allow_time_zone, - bool ansi_mode) + bool const allow_time_zone, + bool const ansi_mode) { if (input.size() == 0) { return nullptr; } return to_timestamp(input, ansi_mode, allow_time_zone); diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp index 2d45f68dca..ba83f43064 100644 --- 
a/src/main/cpp/src/datetime_parser.hpp +++ b/src/main/cpp/src/datetime_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,9 +61,9 @@ namespace spark_rapids_jni { * Unlike Spark, Spark-Rapids currently does not support DST time zones. * * @param input input string column view. - * @param transitions TimezoneDB, the table of transitions contains all - * information for timezones - * @param tz_indices TimezoneDB index of region-based timezone IDs + * @param transitions refer to TimezoneDB, the table of transitions contains all + * information for timezones. + * @param tz_indices refer to TimezoneDB, map from time zone to TimezoneDB transition index. * @param default_tz_index the index of default timezone in TimezoneDB, if input * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use * this time zone. @@ -75,8 +75,8 @@ namespace spark_rapids_jni { std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_view const& input, cudf::column_view const& transitions, cudf::column_view const& tz_indices, - cudf::size_type default_tz_index, - bool ansi_mode); + cudf::size_type const default_tz_index, + bool const ansi_mode); /** * @@ -130,7 +130,7 @@ std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_v * if there exists invalid inputs and ANSI mode is on. 
*/ std::unique_ptr string_to_timestamp_without_tz(cudf::strings_column_view const& input, - bool allow_time_zone, - bool ansi_mode); + bool const allow_time_zone, + bool const ansi_mode); } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 4c84ae1892..5da89319cb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2023-2024-2024, NVIDIA CORPORATION. +* Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 21f99dbabb323f319e2fa08611908fcc727cac27 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 26 Jan 2024 17:38:23 +0800 Subject: [PATCH 32/35] Fix cases --- .../java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java | 8 +++++--- .../com/nvidia/spark/rapids/jni/CastStringsTest.java | 2 +- .../java/com/nvidia/spark/rapids/jni/TimeZoneTest.java | 9 ++++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 5da89319cb..efcd592604 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -66,9 +66,12 @@ public class GpuTimeZoneDB { // structs. The type of this column vector is: // LIST> // use this reference to indicate if time zone cache is initialized. + // `fixedTransitions` saves transitions for deduplicated time zones, different time zones + // may map to one normalized time zone. 
private HostColumnVector fixedTransitions; - // zone id to index in `fixedTransitions` + // time zone to index in `fixedTransitions` + // The key of `zoneIdToTable` is the time zone names before dedup. private Map zoneIdToTable; // host column vector for `zoneIdToTable`, sorted by time zone strings @@ -487,11 +490,10 @@ private ColumnVector getFixedTransitions() { * fixed transitions for a particular zoneId. * * It has default visibility so the test can access it. - * @param zoneId + * @param zoneId the time zones from TimeZone.getAvailableIDs without `ZoneId.normalized` * @return list of fixed transitions */ List getHostFixedTransitions(String zoneId) { - zoneId = ZoneId.of(zoneId, ZoneId.SHORT_IDS).normalized().toString(); // we use the normalized form to dedupe Integer idx = getZoneIDMap().get(zoneId); if (idx == null) { return null; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index de2a7738f6..cafe69a6b4 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -415,7 +415,7 @@ void toTimestampTestWithTz() { // test time zones not in notmalized names, e.g,: ZoneId.of("Etc/GMT").normalized.getId = Z; ZoneId.of("Etc/GMT+0").normalized.getId = Z; Etc/GMT+10 -> -10:00 entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT", 1571610824100000L)); entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+0", 1571610824100000L)); - entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571592825100000L)); + entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571646824100000L)); int validDataSize = entries.size(); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java index 7aaec496de..ebc3e2cb58 100644 --- 
a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java @@ -22,6 +22,7 @@ import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import ai.rapids.cudf.ColumnVector; @@ -45,12 +46,18 @@ static void cleanup() { void databaseLoadedTest() { // Check for a few timezones GpuTimeZoneDB instance = GpuTimeZoneDB.getInstance(); + + // UTC+8 is not in `TimeZone.getAvailableIDs`, so return null + // UTC+8 can be handled by kernel directly List transitions = instance.getHostFixedTransitions("UTC+8"); - assertNotNull(transitions); + assertNull(transitions); + assertEquals(1, transitions.size()); transitions = instance.getHostFixedTransitions("Asia/Shanghai"); assertNotNull(transitions); + ZoneId shanghai = ZoneId.of("Asia/Shanghai").normalized(); + // inserted a min transition place holder, so it's n + 1 assertEquals(shanghai.getRules().getTransitions().size() + 1, transitions.size()); } From 6ddb91c12a1b8618911dc30f954b87dc258d8c0e Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 26 Jan 2024 20:54:50 +0800 Subject: [PATCH 33/35] Fix cudaErrorIllegalAddress error; Fix null pointer bug --- src/main/cpp/src/datetime_parser.cu | 2 +- src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index e1ad607555..eae5503645 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -123,7 +123,7 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 }; template struct parse_timestamp_string_fn { // below three are required: - cudf::column_device_view const& d_strings; + cudf::column_device_view const d_strings; cudf::size_type const 
default_tz_index; bool const allow_tz_in_date_str; diff --git a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java index ebc3e2cb58..f50fe64c51 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java @@ -52,7 +52,6 @@ void databaseLoadedTest() { List transitions = instance.getHostFixedTransitions("UTC+8"); assertNull(transitions); - assertEquals(1, transitions.size()); transitions = instance.getHostFixedTransitions("Asia/Shanghai"); assertNotNull(transitions); From 863cb8330d16f33c20f1e284b7b3125253c74e05 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 26 Jan 2024 21:17:53 +0800 Subject: [PATCH 34/35] Update comments --- src/main/cpp/src/datetime_parser.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index eae5503645..d02139cce4 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -635,8 +635,9 @@ std::unique_ptr to_timestamp( /** * Parse string column with time zone to timestamp column. * If a string does not have time zone in it, use the default time zone. - * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null - * timestamp column otherwise. 
+ * + * Returns nullptr if ANSI mode is true and strings have invalid data, + * otherwise, returns non-null timestamp column(the invalid date will be empty in this column) * */ std::unique_ptr string_to_timestamp_with_tz(cudf::strings_column_view const& input, From de746455d7c52abe94bb219fdb9dac71976e74cf Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 30 Jan 2024 14:51:35 +0800 Subject: [PATCH 35/35] Refactor --- src/main/cpp/src/datetime_parser.cu | 79 +++++++++++++++-------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu index d02139cce4..505f9821ad 100644 --- a/src/main/cpp/src/datetime_parser.cu +++ b/src/main/cpp/src/datetime_parser.cu @@ -142,12 +142,14 @@ struct parse_timestamp_string_fn { return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); } - auto const d_str = d_strings.element(idx); - - timestamp_components ts_comp{}; - char const* tz_lit_ptr = nullptr; - cudf::size_type tz_lit_len = 0; - switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) { + auto const d_str = d_strings.element(idx); + auto parse_ret_tuple = parse_string_to_timestamp_us(d_str); + auto ts_comp = thrust::get<0>(parse_ret_tuple); + auto tz_lit_ptr = thrust::get<1>(parse_ret_tuple); + auto tz_lit_len = thrust::get<2>(parse_ret_tuple); + auto result = thrust::get<3>(parse_ret_tuple); + + switch (result) { case ParseResult::INVALID: return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID); case ParseResult::UNSUPPORTED: @@ -391,12 +393,15 @@ struct parse_timestamp_string_fn { * Parse a string with time zone to a timestamp. * The bool in the returned tuple is false if the parse failed. 
*/ - __device__ inline ParseResult parse_string_to_timestamp_us( - timestamp_components* ts_comp, - char const** parsed_tz_ptr, - cudf::size_type* parsed_tz_length, - cudf::string_view const& timestamp_str) const + __device__ inline thrust::tuple + parse_string_to_timestamp_us(cudf::string_view const& timestamp_str) const { + timestamp_components ts_comp{}; + char const* parsed_tz_ptr = nullptr; + cudf::size_type parsed_tz_length = -1; + auto invalid_ret = + thrust::make_tuple(ts_comp, parsed_tz_ptr, parsed_tz_length, ParseResult::INVALID); + const char* curr_ptr = timestamp_str.data(); const char* end_ptr = curr_ptr + timestamp_str.size_bytes(); @@ -409,7 +414,7 @@ struct parse_timestamp_string_fn { --end_ptr; } - if (curr_ptr == end_ptr) { return ParseResult::INVALID; } + if (curr_ptr == end_ptr) { return invalid_ret; } const char* const bytes = curr_ptr; const cudf::size_type bytes_length = end_ptr - curr_ptr; @@ -441,72 +446,72 @@ struct parse_timestamp_string_fn { i += 3; } else if (i < 2) { if (b == '-') { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else if (0 == i && ':' == b && !year_sign.has_value()) { - if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(3, current_segment_digits)) { return invalid_ret; } segments[3] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i = 4; } else { - return ParseResult::INVALID; + return invalid_ret; } } else if (2 == i) { if (' ' == b || 'T' == b) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return 
ParseResult::INVALID; + return invalid_ret; } } else if (3 == i || 4 == i) { if (':' == b) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return ParseResult::INVALID; + return invalid_ret; } } else if (5 == i || 6 == i) { if ('.' == b && 5 == i) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { - return ParseResult::INVALID; + return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; - *parsed_tz_ptr = bytes + j; + parsed_tz_ptr = bytes + j; // strip the whitespace between timestamp and timezone - while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) - ++(*parsed_tz_ptr); - *parsed_tz_length = end_ptr - *parsed_tz_ptr; + while (parsed_tz_ptr < end_ptr && is_whitespace(*parsed_tz_ptr)) + ++parsed_tz_ptr; + parsed_tz_length = end_ptr - parsed_tz_ptr; break; } if (i == 6 && '.' 
!= b) { i += 1; } } else { if (i < segments_len && (':' == b || ' ' == b)) { - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; current_segment_value = 0; current_segment_digits = 0; i += 1; } else { - return ParseResult::INVALID; + return invalid_ret; } } } else { @@ -521,7 +526,7 @@ struct parse_timestamp_string_fn { j += 1; } - if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; } + if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; } segments[i] = current_segment_value; while (digits_milli < 6) { @@ -535,15 +540,15 @@ struct parse_timestamp_string_fn { // copy segments to equivalent kernel timestamp_components // Note: In order to keep above code is equivalent to Spark implementation, // did not use `timestamp_components` directly to save values. - ts_comp->year = segments[0]; - ts_comp->month = static_cast(segments[1]); - ts_comp->day = static_cast(segments[2]); - ts_comp->hour = static_cast(segments[3]); - ts_comp->minute = static_cast(segments[4]); - ts_comp->second = static_cast(segments[5]); - ts_comp->microseconds = segments[6]; - - return ParseResult::OK; + ts_comp.year = segments[0]; + ts_comp.month = static_cast(segments[1]); + ts_comp.day = static_cast(segments[2]); + ts_comp.hour = static_cast(segments[3]); + ts_comp.minute = static_cast(segments[4]); + ts_comp.second = static_cast(segments[5]); + ts_comp.microseconds = segments[6]; + + return thrust::make_tuple(ts_comp, parsed_tz_ptr, parsed_tz_length, ParseResult::OK); } };