From b15c7623e2f0eb2b5a32c7a8d4ad561b84308761 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Sun, 5 Nov 2023 20:01:02 +0800
Subject: [PATCH 01/35] Add timestamp parser

Signed-off-by: Chong Gao <res_life@163.com>
---
 src/main/cpp/CMakeLists.txt          |   1 +
 src/main/cpp/src/datetime_parser.cu  | 453 +++++++++++++++++++++++++++
 src/main/cpp/src/datetime_parser.hpp |  67 ++++
 3 files changed, 521 insertions(+)
 create mode 100644 src/main/cpp/src/datetime_parser.cu
 create mode 100644 src/main/cpp/src/datetime_parser.hpp
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 1ad65687e2..4eabade61b 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -165,6 +165,7 @@ add_library(
   src/cast_float_to_string.cu
   src/cast_string.cu
   src/cast_string_to_float.cu
+  src/datetime_parser.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
   src/histogram.cu
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
new file mode 100644
index 0000000000..75a21493dd
--- /dev/null
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <numeric>
+#include <vector>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/strings/detail/converters.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include <cudf/wrappers/timestamps.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/optional.h>
+#include <thrust/pair.h>
+#include <thrust/transform.h>
+
+#include "datetime_parser.hpp"
+
+namespace {
+
+using timestamp_components = spark_rapids_jni::timestamp_components;
+
+/**
+ * Get the timestamp from epoch from a local date time in a specific time zone.
+ * Note: local date time may be overlap or gap, refer to `ZonedDateTime.of`
+ *
+ */
+__device__ cudf::timestamp_us
+create_timestamp_from_components_and_zone(timestamp_components local_timestamp_components,
+                                          cudf::string_view time_zone) {
+  // TODO: implements:
+  //   val localDateTime = LocalDateTime.of(localDate, localTime)
+  //   val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId)
+  //   val instant = Instant.from(zonedDateTime) // main work
+  //   instantToMicros(instant)
+  // here just return a zero
+  return cudf::timestamp_us{cudf::duration_us{0L}};
+}
+
+__device__ __host__ inline bool is_digit(const char chr) {
+  return (chr >= '0' && chr <= '9');
+}
+
+__device__ __host__ inline bool is_whitespace(const char chr) {
+  switch (chr) {
+    case ' ':
+    case '\r':
+    case '\t':
+    case '\n': return true;
+    default: return false;
+  }
+}
+
+/**
+ * first trim the time zone,
+ * then format (+|-)h:mm, (+|-)hh:m or (+|-)h:m to (+|-)hh:mm
+ * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L39
+ */
+__device__ __host__ cudf::string_view format_zone_id(const cudf::string_view &time_zone_id) {
+  const char *curr_ptr = time_zone_id.data();
+  const char *end_ptr = curr_ptr + time_zone_id.size_bytes();
+
+  // trim left
+  int num_of_left_white_space = 0;
+  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
+    ++curr_ptr;
+    ++num_of_left_white_space;
+  }
+  // trim right
+  while (curr_ptr < (end_ptr - 1) && is_whitespace(*(end_ptr - 1))) {
+    --end_ptr;
+  }
+
+  const int length_after_trim = end_ptr - curr_ptr;
+  int state = 0;
+  char ret[] = "+00:00";     // save the formatted result
+  bool is_valid_form = true; // is one form of: (+|-)h:mm$, (+|-)hh:m$, (+|-)h:m$, (+|-)hh:mm$
+  int curr_digit_num = 0;
+  while (curr_ptr <= end_ptr && is_valid_form) {
+    char chr = *curr_ptr;
+    if (0 == state) {                                           // expect '+' or '-'
+      if (curr_ptr == end_ptr || !('+' == chr || '-' == chr)) { // get $
+        is_valid_form = false;
+      } else { // get '+' or '-'
+        ret[0] = chr;
+        state = 1;
+      }
+    } else if (1 == state) {     // exepct hour digits then ':'
+      if (curr_ptr == end_ptr) { // get $
+        is_valid_form = false;
+      } else if (is_digit(chr) && curr_digit_num < 2) { // get digit
+        ++curr_digit_num;
+        // set hh part
+        ret[1] = ret[2];
+        ret[2] = chr;
+      } else if (':' == chr && curr_digit_num > 0) { // get ':'
+        curr_digit_num = 0;
+        state = 2;
+      } else {
+        is_valid_form = false;
+      }
+    } else if (2 == state) {                            // expect minute digits then '$'
+      if (curr_ptr == end_ptr && curr_digit_num > 0) {  // get $
+        state = 3;                                      // success state
+      } else if (is_digit(chr) && curr_digit_num < 2) { // get digit
+        ++curr_digit_num;
+        // set mm part
+        ret[4] = ret[5];
+        ret[5] = chr;
+      } else {
+        is_valid_form = false;
+      }
+    }
+    ++curr_ptr;
+  }
+
+  if (3 == state) {
+    // success
+    return cudf::string_view(ret, 6);
+  } else {
+    // failed to format, just trim time zone id
+    return cudf::string_view(time_zone_id.data() + num_of_left_white_space, length_after_trim);
+  }
+}
+
+__device__ __host__ bool is_valid_digits(int segment, int digits) {
+  // A Long is able to represent a timestamp within [+-]200 thousand years
+  const int constexpr maxDigitsYear = 6;
+  // Segment 6 is the fraction of a second: any digit count is valid, extras past 6 are truncated.
+  return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
+         // Segment 7 is the zone ID: zero digits is fine, e.g. for a region-based zone ID
+         (segment == 7 && digits <= 2) ||
+         (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
+}
+
+/**
+ *
+ * Try to parse timestamp string and get a tuple which contains:
+ * - timestamp_components in timestamp string: (year, month, day, hour, minute, seconds,
+ * microseconds). If timestamp string does not contain date and only contains time, then
+ * (year,month,day) is a invalid value (-1, -1, -1). If timestamp string is invalid, then all the
+ * components is -1.
+ * - time zone in timestamp string, use default time zone if it's empty
+ *
+ * Note: the returned time zone is not validated
+ *
+ * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+ */
+__device__ __host__ thrust::pair<timestamp_components, cudf::string_view>
+parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
+                                        cudf::string_view default_time_zone) {
+  auto error_compoments = timestamp_components{-1, -1, -1, -1, -1, -1, -1};
+  auto error_time_zone = cudf::string_view();
+
+  if (timestamp_str.empty()) {
+    return thrust::make_pair(error_compoments, error_time_zone);
+  }
+
+  const char *curr_ptr = timestamp_str.data();
+  const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
+
+  // trim left
+  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
+    ++curr_ptr;
+  }
+  // trim right
+  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) {
+    --end_ptr;
+  }
+
+  if (curr_ptr == end_ptr) {
+    return thrust::make_pair(error_compoments, error_time_zone);
+  }
+
+  const char *const bytes = curr_ptr;
+  const size_t bytes_length = end_ptr - curr_ptr;
+
+  thrust::optional<cudf::string_view> tz;
+  int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0};
+  int segments_len = 9;
+  int i = 0;
+  int current_segment_value = 0;
+  int current_segment_digits = 0;
+  size_t j = 0;
+  int digits_milli = 0;
+  bool just_time = false;
+  thrust::optional<int> year_sign;
+  if ('-' == bytes[j] || '+' == bytes[j]) {
+    if ('-' == bytes[j]) {
+      year_sign = -1;
+    } else {
+      year_sign = 1;
+    }
+    j += 1;
+  }
+
+  while (j < bytes_length) {
+    char b = bytes[j];
+    int parsed_value = static_cast<int32_t>(b - '0');
+    if (parsed_value < 0 || parsed_value > 9) {
+      if (0 == j && 'T' == b) {
+        just_time = true;
+        i += 3;
+      } else if (i < 2) {
+        if (b == '-') {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+        } else if (0 == i && ':' == b && !year_sign.has_value()) {
+          just_time = true;
+          if (!is_valid_digits(3, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[3] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i = 4;
+        } else {
+          return thrust::make_pair(error_compoments, error_time_zone);
+        }
+      } else if (2 == i) {
+        if (' ' == b || 'T' == b) {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+        } else {
+          return thrust::make_pair(error_compoments, error_time_zone);
+        }
+      } else if (3 == i || 4 == i) {
+        if (':' == b) {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+        } else {
+          return thrust::make_pair(error_compoments, error_time_zone);
+        }
+      } else if (5 == i || 6 == i) {
+        if ('.' == b && 5 == i) {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+        } else {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+          tz = cudf::string_view(bytes + j, (bytes_length - j));
+          j = bytes_length - 1;
+        }
+        if (i == 6 && '.' != b) {
+          i += 1;
+        }
+      } else {
+        if (i < segments_len && (':' == b || ' ' == b)) {
+          if (!is_valid_digits(i, current_segment_digits)) {
+            return thrust::make_pair(error_compoments, error_time_zone);
+          }
+          segments[i] = current_segment_value;
+          current_segment_value = 0;
+          current_segment_digits = 0;
+          i += 1;
+        } else {
+          return thrust::make_pair(error_compoments, error_time_zone);
+        }
+      }
+    } else {
+      if (6 == i) {
+        digits_milli += 1;
+      }
+      // We will truncate the nanosecond part if there are more than 6 digits, which results
+      // in loss of precision
+      if (6 != i || current_segment_digits < 6) {
+        current_segment_value = current_segment_value * 10 + parsed_value;
+      }
+      current_segment_digits += 1;
+    }
+    j += 1;
+  }
+
+  if (!is_valid_digits(i, current_segment_digits)) {
+    return thrust::make_pair(error_compoments, error_time_zone);
+  }
+  segments[i] = current_segment_value;
+
+  while (digits_milli < 6) {
+    segments[6] *= 10;
+    digits_milli += 1;
+  }
+
+  cudf::string_view timze_zone;
+  if (tz.has_value()) {
+    timze_zone = format_zone_id(tz.value());
+  } else {
+    timze_zone = default_time_zone;
+  }
+
+  segments[0] *= year_sign.value_or(1);
+  // above is translated from Spark.
+
+  // set components
+  auto components = timestamp_components{segments[0],
+                                         static_cast<int8_t>(segments[1]),
+                                         static_cast<int8_t>(segments[2]),
+                                         static_cast<int8_t>(segments[3]),
+                                         static_cast<int8_t>(segments[4]),
+                                         static_cast<int8_t>(segments[5]),
+                                         segments[6]};
+  if (just_time) {
+    components.year = components.month = components.day = -1;
+  }
+  return thrust::make_pair(components, timze_zone);
+}
+
+struct parse_timestamp_string_fn {
+  cudf::column_device_view const d_strings;
+  cudf::string_view default_time_zone;
+
+  __device__ cudf::timestamp_us operator()(const cudf::size_type &idx) const {
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    auto components_tz = parse_string_to_timestamp_components_tz(d_str, default_time_zone);
+    return create_timestamp_from_components_and_zone(components_tz.first, components_tz.second);
+  }
+};
+
+/**
+ *
+ * Trims and parses timestamp string column to a timestamp column and a time zone
+ * column
+ *
+ */
+std::unique_ptr<cudf::column> parse_string_to_timestamp_and_time_zone(
+    cudf::strings_column_view const &input, cudf::string_view default_time_zone,
+    rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
+  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto output_timestamp = cudf::make_timestamp_column(
+      cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(),
+      cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr);
+
+  thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(input.size()),
+                    output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
+                    parse_timestamp_string_fn{*d_strings, default_time_zone});
+
+  return output_timestamp;
+}
+
+} // namespace
+
+namespace spark_rapids_jni {
+
+/**
+ *
+ * Trims and parses timestamp string column to a timestamp components column and a time zone
+ * column, then create timestamp column
+ * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+ *
+ * @param input input string column view.
+ * @param default_time_zone if input string does not contain a time zone, use this time zone.
+ * @returns timestamp components column and time zone string.
+ * be empty.
+ */
+std::unique_ptr<cudf::column> parse_string_to_timestamp(cudf::strings_column_view const &input,
+                                                        cudf::string_view default_time_zone) {
+  auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS};
+  if (input.size() == 0) {
+    return cudf::make_empty_column(timestamp_type.id());
+  }
+
+  auto const stream = cudf::get_default_stream();
+  auto const mr = rmm::mr::get_current_device_resource();
+  return parse_string_to_timestamp_and_time_zone(input, default_time_zone, stream, mr);
+}
+
+/**
+ *
+ * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone`
+ */
+std::unique_ptr<cudf::column>
+string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
+                                      bool allow_time_zone) {
+  // TODO
+  throw std::runtime_error("Not implemented!!!");
+}
+
+/**
+ *
+ * Refer to `SparkDateTimeUtils.stringToTimestamp`
+ */
+std::unique_ptr<cudf::column> string_to_timestamp(cudf::strings_column_view const &input,
+                                                  cudf::string_view time_zone) {
+  // TODO
+  throw std::runtime_error("Not implemented!!!");
+}
+
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
new file mode 100644
index 0000000000..509fcf8008
--- /dev/null
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace spark_rapids_jni {
+
+/**
+ * represents local date time in a time zone.
+ */
+struct timestamp_components {
+  int32_t year; // max 6 digits
+  int8_t month;
+  int8_t day;
+  int8_t hour;
+  int8_t minute;
+  int8_t second;
+  int32_t microseconds;
+};
+
+thrust::pair<timestamp_components, cudf::string_view>
+parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
+                                        cudf::string_view default_time_zone);
+
+/**
+ *
+ * Trims and parses timestamp string column to a timestamp components column and a time zone
+ * column, then create timestamp column
+ * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+ *
+ * @param input input string column view.
+ * @param default_time_zone if input string does not contain a time zone, use this time zone.
+ * @returns timestamp components column and time zone string.
+ * be empty.
+ */
+std::unique_ptr<cudf::column> parse_string_to_timestamp(cudf::strings_column_view const &input,
+                                                        cudf::string_view default_time_zone);
+
+/**
+ *
+ * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone`
+ */
+std::unique_ptr<cudf::column>
+string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, bool allow_time_zone);
+
+/**
+ *
+ * Refer to `SparkDateTimeUtils.stringToTimestamp`
+ */
+std::unique_ptr<cudf::column> string_to_timestamp(cudf::strings_column_view const &input,
+                                                  cudf::string_view time_zone);
+
+} // namespace spark_rapids_jni

From 73e0f7e86f807d0763d1198d48c235161222327d Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Tue, 12 Dec 2023 10:37:26 +0800
Subject: [PATCH 02/35] Refine parser

---
 src/main/cpp/src/datetime_parser.cu  | 269 ++++++++-------------------
 src/main/cpp/src/datetime_parser.hpp |  81 ++++----
 2 files changed, 125 insertions(+), 225 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 75a21493dd..14ed0a9c9f 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -32,48 +32,46 @@
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/span.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/logical.h>
 #include <thrust/optional.h>
-#include <thrust/pair.h>
 #include <thrust/transform.h>
+#include <thrust/tuple.h>
 
 #include "datetime_parser.hpp"
 
 namespace {
 
-using timestamp_components = spark_rapids_jni::timestamp_components;
-
 /**
- * Get the timestamp from epoch from a local date time in a specific time zone.
- * Note: local date time may be overlap or gap, refer to `ZonedDateTime.of`
- *
+ * represents local date time in a time zone.
  */
-__device__ cudf::timestamp_us
-create_timestamp_from_components_and_zone(timestamp_components local_timestamp_components,
-                                          cudf::string_view time_zone) {
-  // TODO: implements:
-  //   val localDateTime = LocalDateTime.of(localDate, localTime)
-  //   val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId)
-  //   val instant = Instant.from(zonedDateTime) // main work
-  //   instantToMicros(instant)
-  // here just return a zero
-  return cudf::timestamp_us{cudf::duration_us{0L}};
-}
+struct timestamp_components {
+  int32_t year;         // signed, max 6 digits; the parser stores -1 for time-only input
+  int8_t month;         // the parser stores -1 for time-only input
+  int8_t day;           // the parser stores -1 for time-only input
+  int8_t hour;
+  int8_t minute;
+  int8_t second;
+  int32_t microseconds; // fraction of a second, scaled to 6 decimal digits by the parser
+};
 
-__device__ __host__ inline bool is_digit(const char chr) {
-  return (chr >= '0' && chr <= '9');
+/**
+ * Convert a local time in a time zone to a UTC timestamp plus a flag (currently always true).
+ */
+__device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
+to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) {
+  // TODO replace the temp implementation: 365-day years, 30-day months, time_zone is ignored
+  long v = 365L * 86400L * 1000000L * components.year + 30L * 86400L * 1000000L * components.month +
+           86400L * 1000000L * components.day + 3600L * 1000000L * components.hour +
+           60L * 1000000L * components.minute + 1000000L * components.second +
+           components.microseconds;
+  return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true);
 }
 
 __device__ __host__ inline bool is_whitespace(const char chr) {
@@ -86,79 +84,6 @@ __device__ __host__ inline bool is_whitespace(const char chr) {
   }
 }
 
-/**
- * first trim the time zone,
- * then format (+|-)h:mm, (+|-)hh:m or (+|-)h:m to (+|-)hh:mm
- * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
- * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L39
- */
-__device__ __host__ cudf::string_view format_zone_id(const cudf::string_view &time_zone_id) {
-  const char *curr_ptr = time_zone_id.data();
-  const char *end_ptr = curr_ptr + time_zone_id.size_bytes();
-
-  // trim left
-  int num_of_left_white_space = 0;
-  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
-    ++curr_ptr;
-    ++num_of_left_white_space;
-  }
-  // trim right
-  while (curr_ptr < (end_ptr - 1) && is_whitespace(*(end_ptr - 1))) {
-    --end_ptr;
-  }
-
-  const int length_after_trim = end_ptr - curr_ptr;
-  int state = 0;
-  char ret[] = "+00:00";     // save the formatted result
-  bool is_valid_form = true; // is one form of: (+|-)h:mm$, (+|-)hh:m$, (+|-)h:m$, (+|-)hh:mm$
-  int curr_digit_num = 0;
-  while (curr_ptr <= end_ptr && is_valid_form) {
-    char chr = *curr_ptr;
-    if (0 == state) {                                           // expect '+' or '-'
-      if (curr_ptr == end_ptr || !('+' == chr || '-' == chr)) { // get $
-        is_valid_form = false;
-      } else { // get '+' or '-'
-        ret[0] = chr;
-        state = 1;
-      }
-    } else if (1 == state) {     // exepct hour digits then ':'
-      if (curr_ptr == end_ptr) { // get $
-        is_valid_form = false;
-      } else if (is_digit(chr) && curr_digit_num < 2) { // get digit
-        ++curr_digit_num;
-        // set hh part
-        ret[1] = ret[2];
-        ret[2] = chr;
-      } else if (':' == chr && curr_digit_num > 0) { // get ':'
-        curr_digit_num = 0;
-        state = 2;
-      } else {
-        is_valid_form = false;
-      }
-    } else if (2 == state) {                            // expect minute digits then '$'
-      if (curr_ptr == end_ptr && curr_digit_num > 0) {  // get $
-        state = 3;                                      // success state
-      } else if (is_digit(chr) && curr_digit_num < 2) { // get digit
-        ++curr_digit_num;
-        // set mm part
-        ret[4] = ret[5];
-        ret[5] = chr;
-      } else {
-        is_valid_form = false;
-      }
-    }
-    ++curr_ptr;
-  }
-
-  if (3 == state) {
-    // success
-    return cudf::string_view(ret, 6);
-  } else {
-    // failed to format, just trim time zone id
-    return cudf::string_view(time_zone_id.data() + num_of_left_white_space, length_after_trim);
-  }
-}
-
 __device__ __host__ bool is_valid_digits(int segment, int digits) {
   // A Long is able to represent a timestamp within [+-]200 thousand years
   const int constexpr maxDigitsYear = 6;
@@ -169,28 +94,14 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) {
          (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
-/**
- *
- * Try to parse timestamp string and get a tuple which contains:
- * - timestamp_components in timestamp string: (year, month, day, hour, minute, seconds,
- * microseconds). If timestamp string does not contain date and only contains time, then
- * (year,month,day) is a invalid value (-1, -1, -1). If timestamp string is invalid, then all the
- * components is -1.
- * - time zone in timestamp string, use default time zone if it's empty
- *
- * Note: the returned time zone is not validated
- *
- * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
- * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
- */
-__device__ __host__ thrust::pair<timestamp_components, cudf::string_view>
-parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
-                                        cudf::string_view default_time_zone) {
-  auto error_compoments = timestamp_components{-1, -1, -1, -1, -1, -1, -1};
-  auto error_time_zone = cudf::string_view();
+__device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
+parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char *default_time_zone,
+                             cudf::size_type default_time_zone_char_len, bool allow_time_zone,
+                             bool allow_special_expressions, bool ansi_mode) {
+  auto error_us = cudf::timestamp_us{cudf::duration_us{0}};
 
   if (timestamp_str.empty()) {
-    return thrust::make_pair(error_compoments, error_time_zone);
+    return thrust::make_tuple(error_us, false);
   }
 
   const char *curr_ptr = timestamp_str.data();
@@ -206,7 +117,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
   }
 
   if (curr_ptr == end_ptr) {
-    return thrust::make_pair(error_compoments, error_time_zone);
+    return thrust::make_tuple(error_us, false);
   }
 
   const char *const bytes = curr_ptr;
@@ -241,7 +152,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
       } else if (i < 2) {
         if (b == '-') {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
@@ -250,43 +161,43 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
         } else if (0 == i && ':' == b && !year_sign.has_value()) {
           just_time = true;
           if (!is_valid_digits(3, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[3] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i = 4;
         } else {
-          return thrust::make_pair(error_compoments, error_time_zone);
+          return thrust::make_tuple(error_us, false);
         }
       } else if (2 == i) {
         if (' ' == b || 'T' == b) {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
         } else {
-          return thrust::make_pair(error_compoments, error_time_zone);
+          return thrust::make_tuple(error_us, false);
         }
       } else if (3 == i || 4 == i) {
         if (':' == b) {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
         } else {
-          return thrust::make_pair(error_compoments, error_time_zone);
+          return thrust::make_tuple(error_us, false);
         }
       } else if (5 == i || 6 == i) {
         if ('.' == b && 5 == i) {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
@@ -294,7 +205,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
           i += 1;
         } else {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
@@ -309,14 +220,14 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
       } else {
         if (i < segments_len && (':' == b || ' ' == b)) {
           if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_pair(error_compoments, error_time_zone);
+            return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
         } else {
-          return thrust::make_pair(error_compoments, error_time_zone);
+          return thrust::make_tuple(error_us, false);
         }
       }
     } else {
@@ -334,7 +245,7 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
   }
 
   if (!is_valid_digits(i, current_segment_digits)) {
-    return thrust::make_pair(error_compoments, error_time_zone);
+    return thrust::make_tuple(error_us, false);
   }
   segments[i] = current_segment_value;
 
@@ -345,13 +256,13 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
 
   cudf::string_view timze_zone;
   if (tz.has_value()) {
-    timze_zone = format_zone_id(tz.value());
+    timze_zone = tz.value();
   } else {
-    timze_zone = default_time_zone;
+    timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len);
   }
 
   segments[0] *= year_sign.value_or(1);
-  // above is translated from Spark.
+  // above is ported from Spark.
 
   // set components
   auto components = timestamp_components{segments[0],
@@ -364,90 +275,74 @@ parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
   if (just_time) {
     components.year = components.month = components.day = -1;
   }
-  return thrust::make_pair(components, timze_zone);
+  return to_utc_timestamp(components, timze_zone);
 }
 
 struct parse_timestamp_string_fn {
   cudf::column_device_view const d_strings;
-  cudf::string_view default_time_zone;
+  const char *default_time_zone;
+  cudf::size_type default_time_zone_char_len;
+  bool allow_time_zone;
+  bool allow_special_expressions;
+  bool ansi_mode;
 
-  __device__ cudf::timestamp_us operator()(const cudf::size_type &idx) const {
+  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type &idx) const {
     auto const d_str = d_strings.element<cudf::string_view>(idx);
-    auto components_tz = parse_string_to_timestamp_components_tz(d_str, default_time_zone);
-    return create_timestamp_from_components_and_zone(components_tz.first, components_tz.second);
+    return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len,
+                                        allow_time_zone, allow_special_expressions, ansi_mode);
   }
 };
 
 /**
  *
- * Trims and parses timestamp string column to a timestamp column and a time zone
- * column
+ * Trims and parses timestamp string column to a timestamp column and a is valid column
  *
  */
-std::unique_ptr<cudf::column> parse_string_to_timestamp_and_time_zone(
-    cudf::strings_column_view const &input, cudf::string_view default_time_zone,
-    rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+string_to_timestamp(cudf::strings_column_view const &input,
+                    std::string_view const &default_time_zone, bool allow_time_zone,
+                    bool allow_special_expressions, bool ansi_mode, rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource *mr) {
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
 
   auto output_timestamp = cudf::make_timestamp_column(
       cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(),
       cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr);
+  auto output_bool = cudf::make_fixed_width_column(
+      cudf::data_type{cudf::type_id::BOOL8}, input.size(),
+      cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr);
 
-  thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-                    thrust::make_counting_iterator(input.size()),
-                    output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
-                    parse_timestamp_string_fn{*d_strings, default_time_zone});
-
-  return output_timestamp;
+  thrust::transform(
+      rmm::exec_policy(stream), thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(input.size()),
+      thrust::make_zip_iterator(
+          thrust::make_tuple(output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
+                             output_bool->mutable_view().begin<bool>())),
+      parse_timestamp_string_fn{*d_strings, default_time_zone.data(),
+                                static_cast<cudf::size_type>(default_time_zone.size()),
+                                allow_time_zone, allow_special_expressions, ansi_mode});
+
+  return std::make_pair(std::move(output_timestamp), std::move(output_bool));
 }
 
 } // namespace
 
 namespace spark_rapids_jni {
 
-/**
- *
- * Trims and parses timestamp string column to a timestamp components column and a time zone
- * column, then create timestamp column
- * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
- * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
- *
- * @param input input string column view.
- * @param default_time_zone if input string does not contain a time zone, use this time zone.
- * @returns timestamp components column and time zone string.
- * be empty.
- */
-std::unique_ptr<cudf::column> parse_string_to_timestamp(cudf::strings_column_view const &input,
-                                                        cudf::string_view default_time_zone) {
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+parse_string_to_timestamp(cudf::strings_column_view const &input,
+                          std::string_view const &default_time_zone, bool allow_time_zone,
+                          bool allow_special_expressions, bool ansi_mode) {
   auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS};
   if (input.size() == 0) {
-    return cudf::make_empty_column(timestamp_type.id());
+    return std::make_pair(cudf::make_empty_column(timestamp_type.id()),
+                          cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}));
   }
 
   auto const stream = cudf::get_default_stream();
   auto const mr = rmm::mr::get_current_device_resource();
-  return parse_string_to_timestamp_and_time_zone(input, default_time_zone, stream, mr);
-}
-
-/**
- *
- * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone`
- */
-std::unique_ptr<cudf::column>
-string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
-                                      bool allow_time_zone) {
-  // TODO
-  throw std::runtime_error("Not implemented!!!");
-}
-
-/**
- *
- * Refer to `SparkDateTimeUtils.stringToTimestamp`
- */
-std::unique_ptr<cudf::column> string_to_timestamp(cudf::strings_column_view const &input,
-                                                  cudf::string_view time_zone) {
-  // TODO
-  throw std::runtime_error("Not implemented!!!");
+  return string_to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions,
+                             ansi_mode, stream, mr);
 }
 
 } // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 509fcf8008..139e69086b 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -19,49 +19,54 @@
 namespace spark_rapids_jni {
 
 /**
- * represents local date time in a time zone.
- */
-struct timestamp_components {
-  int32_t year; // max 6 digits
-  int8_t month;
-  int8_t day;
-  int8_t hour;
-  int8_t minute;
-  int8_t second;
-  int32_t microseconds;
-};
-
-thrust::pair<timestamp_components, cudf::string_view>
-parse_string_to_timestamp_components_tz(cudf::string_view timestamp_str,
-                                        cudf::string_view default_time_zone);
-
-/**
  *
- * Trims and parses timestamp string column to a timestamp components column and a time zone
- * column, then create timestamp column
+ * Trims and parses a timestamp string column with time zone suffix to a timestamp column.
+ * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00
+ *
  * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
  * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
  *
- * @param input input string column view.
- * @param default_time_zone if input string does not contain a time zone, use this time zone.
- * @returns timestamp components column and time zone string.
- * be empty.
- */
-std::unique_ptr<cudf::column> parse_string_to_timestamp(cudf::strings_column_view const &input,
-                                                        cudf::string_view default_time_zone);
-
-/**
+ * Formats are:
  *
- * Refer to `SparkDateTimeUtils.stringToTimestampWithoutTimeZone`
- */
-std::unique_ptr<cudf::column>
-string_to_timestamp_without_time_zone(cudf::strings_column_view const &input, bool allow_time_zone);
-
-/**
+ * `[+-]yyyy*`
+ * `[+-]yyyy*-[m]m`
+ * `[+-]yyyy*-[m]m-[d]d`
+ * `[+-]yyyy*-[m]m-[d]d `
+ * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ *
+ * Spark supports the following zone id forms:
+ *   - Z - Zulu time zone UTC+0
+ *   - +|-[h]h:[m]m
+ *   - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+ *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
+ *     and a suffix in the formats:
+ *     - +|-h[h]
+ *     - +|-hh[:]mm
+ *     - +|-hh:mm:ss
+ *     - +|-hhmmss
+ *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+ *
+ * Unlike Spark, Spark-Rapids only supports the following time zones:
+ *   - Z - Zulu time zone UTC+0
+ *   - +|-[h]h:[m]m
+ *   - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
  *
- * Refer to `SparkDateTimeUtils.stringToTimestamp`
+ *
+ * @param input input string column view.
+ * @param default_time_zone if input string does not contain a time zone, use this time zone.
+ * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: 
+ *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
+ * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings.
+ * @param ansi_mode is ansi mode
+ * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise.
  */
-std::unique_ptr<cudf::column> string_to_timestamp(cudf::strings_column_view const &input,
-                                                  cudf::string_view time_zone);
-
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+parse_string_to_timestamp(cudf::strings_column_view const &input,
+                          std::string_view const &default_time_zone,
+                          bool allow_time_zone,
+                          bool allow_special_expressions,
+                          bool ansi_mode);
 } // namespace spark_rapids_jni

From df60772b70befbbef9a95ec2088532a662fdd996 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Tue, 12 Dec 2023 17:05:15 +0800
Subject: [PATCH 03/35] Update

---
 src/main/cpp/src/datetime_parser.cu    | 368 ++++++++++++++++++-------
 src/main/cpp/src/datetime_parser.hpp   |  23 +-
 src/main/cpp/tests/CMakeLists.txt      |   3 +
 src/main/cpp/tests/datetime_parser.cpp |  55 ++++
 4 files changed, 351 insertions(+), 98 deletions(-)
 create mode 100644 src/main/cpp/tests/datetime_parser.cpp

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 14ed0a9c9f..cd5b53e32f 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -18,27 +18,20 @@
 #include <numeric>
 #include <vector>
 
-#include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/convert/convert_datetime.hpp>
-#include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/detail/utilities.cuh>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/search.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/optional.h>
 #include <thrust/transform.h>
@@ -46,12 +39,14 @@
 
 #include "datetime_parser.hpp"
 
-namespace {
+namespace
+{
 
 /**
  * represents local date time in a time zone.
  */
-struct timestamp_components {
+struct timestamp_components
+{
   int32_t year; // max 6 digits
   int8_t month;
   int8_t day;
@@ -65,42 +60,73 @@ struct timestamp_components {
  * convert a local time in a time zone to UTC timestamp
  */
 __device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
-to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone) {
+to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone)
+{
   // TODO replace the temp implementation
   long v = 365L * 86400L * 1000000L * components.year + 30L * 86400L * 1000000L * components.month +
-           86400L * 1000000L * components.day + 3600L * 1000000L * components.hour +
-           60L * 1000000L * components.minute + 1000000L * components.second +
-           components.microseconds;
+            86400L * 1000000L * components.day + 3600L * 1000000L * components.hour +
+            60L * 1000000L * components.minute + 1000000L * components.second +
+            components.microseconds;
   return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true);
 }
 
-__device__ __host__ inline bool is_whitespace(const char chr) {
-  switch (chr) {
-    case ' ':
-    case '\r':
-    case '\t':
-    case '\n': return true;
-    default: return false;
+__device__ __host__ inline bool is_whitespace(const char chr)
+{
+  switch (chr)
+  {
+  case ' ':
+  case '\r':
+  case '\t':
+  case '\n':
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true if the two strings are equal ignoring case; the expected string must be lower-case
+__device__ __host__ inline bool equals(const char *actual_begin, const char *actual_end,
+                                        const char *expect_begin, const char *expect_end)
+{
+  if (actual_end - actual_begin != expect_end - expect_begin)
+  {
+    return false;
+  }
+
+  while (actual_begin < actual_end)
+  {
+    // the difference between the lower-case and upper-case forms of the same ASCII letter is 32
+    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32))
+    {
+      return false;
+    }
+    actual_begin++;
+    expect_begin++;
   }
+  return true;
 }
 
-__device__ __host__ bool is_valid_digits(int segment, int digits) {
+__device__ __host__ bool is_valid_digits(int segment, int digits)
+{
   // A Long is able to represent a timestamp within [+-]200 thousand years
   const int constexpr maxDigitsYear = 6;
   // For the nanosecond part, more than 6 digits is allowed, but will be truncated.
   return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
-         // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID
-         (segment == 7 && digits <= 2) ||
-         (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
+          // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID
+          (segment == 7 && digits <= 2) ||
+          (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
 __device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
 parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char *default_time_zone,
-                             cudf::size_type default_time_zone_char_len, bool allow_time_zone,
-                             bool allow_special_expressions, bool ansi_mode) {
+                              cudf::size_type default_time_zone_char_len, bool allow_time_zone,
+                              bool allow_special_expressions)
+{
+
   auto error_us = cudf::timestamp_us{cudf::duration_us{0}};
 
-  if (timestamp_str.empty()) {
+  if (timestamp_str.empty())
+  {
     return thrust::make_tuple(error_us, false);
   }
 
@@ -108,15 +134,58 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
   const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
 
   // trim left
-  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
+  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr))
+  {
     ++curr_ptr;
   }
   // trim right
-  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) {
+  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1)))
+  {
     --end_ptr;
   }
 
-  if (curr_ptr == end_ptr) {
+  // special strings: epoch, now, today, yesterday, tomorrow
+  // TODO
+  if (allow_special_expressions)
+  {
+    char const *begin_epoch = "epoch";
+    char const *end_epoch = begin_epoch + 5;
+
+    char const *begin_now = "now";
+    char const *end_now = begin_now + 3;
+
+    char const *begin_today = "today";
+    char const *end_today = begin_today + 5;
+
+    char const *begin_yesterday = "yesterday";
+    char const *end_yesterday = begin_yesterday + 9;
+
+    char const *begin_tomorrow = "tomorrow";
+    char const *end_tomorrow = begin_tomorrow + 8;
+    if (equals(curr_ptr, end_ptr, begin_epoch, end_epoch))
+    { // epoch
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, true);
+    }
+    else if (equals(curr_ptr, end_ptr, begin_now, end_now))
+    {
+      // now
+    }
+    else if (equals(curr_ptr, end_ptr, begin_today, end_today))
+    {
+      // today
+    }
+    else if (equals(curr_ptr, end_ptr, begin_yesterday, end_yesterday))
+    {
+      // yesterday
+    }
+    else if (equals(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow))
+    {
+      // tomorrow
+    }
+  }
+
+  if (curr_ptr == end_ptr)
+  {
     return thrust::make_tuple(error_us, false);
   }
 
@@ -133,78 +202,113 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
   int digits_milli = 0;
   bool just_time = false;
   thrust::optional<int> year_sign;
-  if ('-' == bytes[j] || '+' == bytes[j]) {
-    if ('-' == bytes[j]) {
+  if ('-' == bytes[j] || '+' == bytes[j])
+  {
+    if ('-' == bytes[j])
+    {
       year_sign = -1;
-    } else {
+    }
+    else
+    {
       year_sign = 1;
     }
     j += 1;
   }
 
-  while (j < bytes_length) {
+  while (j < bytes_length)
+  {
     char b = bytes[j];
     int parsed_value = static_cast<int32_t>(b - '0');
-    if (parsed_value < 0 || parsed_value > 9) {
-      if (0 == j && 'T' == b) {
+    if (parsed_value < 0 || parsed_value > 9)
+    {
+      if (0 == j && 'T' == b)
+      {
         just_time = true;
         i += 3;
-      } else if (i < 2) {
-        if (b == '-') {
-          if (!is_valid_digits(i, current_segment_digits)) {
+      }
+      else if (i < 2)
+      {
+        if (b == '-')
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
-        } else if (0 == i && ':' == b && !year_sign.has_value()) {
+        }
+        else if (0 == i && ':' == b && !year_sign.has_value())
+        {
           just_time = true;
-          if (!is_valid_digits(3, current_segment_digits)) {
+          if (!is_valid_digits(3, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[3] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i = 4;
-        } else {
+        }
+        else
+        {
           return thrust::make_tuple(error_us, false);
         }
-      } else if (2 == i) {
-        if (' ' == b || 'T' == b) {
-          if (!is_valid_digits(i, current_segment_digits)) {
+      }
+      else if (2 == i)
+      {
+        if (' ' == b || 'T' == b)
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
-        } else {
+        }
+        else
+        {
           return thrust::make_tuple(error_us, false);
         }
-      } else if (3 == i || 4 == i) {
-        if (':' == b) {
-          if (!is_valid_digits(i, current_segment_digits)) {
+      }
+      else if (3 == i || 4 == i)
+      {
+        if (':' == b)
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
-        } else {
+        }
+        else
+        {
           return thrust::make_tuple(error_us, false);
         }
-      } else if (5 == i || 6 == i) {
-        if ('.' == b && 5 == i) {
-          if (!is_valid_digits(i, current_segment_digits)) {
+      }
+      else if (5 == i || 6 == i)
+      {
+        if ('.' == b && 5 == i)
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
-        } else {
-          if (!is_valid_digits(i, current_segment_digits)) {
+        }
+        else
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
@@ -214,29 +318,40 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
           tz = cudf::string_view(bytes + j, (bytes_length - j));
           j = bytes_length - 1;
         }
-        if (i == 6 && '.' != b) {
+        if (i == 6 && '.' != b)
+        {
           i += 1;
         }
-      } else {
-        if (i < segments_len && (':' == b || ' ' == b)) {
-          if (!is_valid_digits(i, current_segment_digits)) {
+      }
+      else
+      {
+        if (i < segments_len && (':' == b || ' ' == b))
+        {
+          if (!is_valid_digits(i, current_segment_digits))
+          {
             return thrust::make_tuple(error_us, false);
           }
           segments[i] = current_segment_value;
           current_segment_value = 0;
           current_segment_digits = 0;
           i += 1;
-        } else {
+        }
+        else
+        {
           return thrust::make_tuple(error_us, false);
         }
       }
-    } else {
-      if (6 == i) {
+    }
+    else
+    {
+      if (6 == i)
+      {
         digits_milli += 1;
       }
       // We will truncate the nanosecond part if there are more than 6 digits, which results
       // in loss of precision
-      if (6 != i || current_segment_digits < 6) {
+      if (6 != i || current_segment_digits < 6)
+      {
         current_segment_value = current_segment_value * 10 + parsed_value;
       }
       current_segment_digits += 1;
@@ -244,20 +359,43 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
     j += 1;
   }
 
-  if (!is_valid_digits(i, current_segment_digits)) {
+  if (!is_valid_digits(i, current_segment_digits))
+  {
     return thrust::make_tuple(error_us, false);
   }
   segments[i] = current_segment_value;
 
-  while (digits_milli < 6) {
+  while (digits_milli < 6)
+  {
     segments[6] *= 10;
     digits_milli += 1;
   }
 
+  if (default_time_zone_char_len == 0)
+  {
+    // invoked from `string_to_timestamp_without_time_zone`
+    if (just_time || !allow_time_zone && tz.has_value())
+    {
+      return thrust::make_tuple(error_us, false);
+    }
+  }
+  else
+  {
+    // invoked from `string_to_timestamp`
+    if (just_time)
+    {
+      // TODO
+      // set today: year-month-day
+    }
+  }
+
   cudf::string_view timze_zone;
-  if (tz.has_value()) {
+  if (tz.has_value())
+  {
     timze_zone = tz.value();
-  } else {
+  }
+  else
+  {
     timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len);
   }
 
@@ -266,30 +404,29 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
 
   // set components
   auto components = timestamp_components{segments[0],
-                                         static_cast<int8_t>(segments[1]),
-                                         static_cast<int8_t>(segments[2]),
-                                         static_cast<int8_t>(segments[3]),
-                                         static_cast<int8_t>(segments[4]),
-                                         static_cast<int8_t>(segments[5]),
-                                         segments[6]};
-  if (just_time) {
-    components.year = components.month = components.day = -1;
-  }
+                                          static_cast<int8_t>(segments[1]),
+                                          static_cast<int8_t>(segments[2]),
+                                          static_cast<int8_t>(segments[3]),
+                                          static_cast<int8_t>(segments[4]),
+                                          static_cast<int8_t>(segments[5]),
+                                          segments[6]};
+
   return to_utc_timestamp(components, timze_zone);
 }
 
-struct parse_timestamp_string_fn {
+struct parse_timestamp_string_fn
+{
   cudf::column_device_view const d_strings;
   const char *default_time_zone;
   cudf::size_type default_time_zone_char_len;
   bool allow_time_zone;
   bool allow_special_expressions;
-  bool ansi_mode;
 
-  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type &idx) const {
+  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type &idx) const
+  {
     auto const d_str = d_strings.element<cudf::string_view>(idx);
     return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len,
-                                        allow_time_zone, allow_special_expressions, ansi_mode);
+                                        allow_time_zone, allow_special_expressions);
   }
 };
 
@@ -299,10 +436,11 @@ struct parse_timestamp_string_fn {
  *
  */
 std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
-string_to_timestamp(cudf::strings_column_view const &input,
-                    std::string_view const &default_time_zone, bool allow_time_zone,
-                    bool allow_special_expressions, bool ansi_mode, rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource *mr) {
+to_timestamp(cudf::strings_column_view const &input,
+              std::string_view const &default_time_zone, bool allow_time_zone,
+              bool allow_special_expressions, rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource *mr)
+{
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
 
   auto output_timestamp = cudf::make_timestamp_column(
@@ -317,32 +455,68 @@ string_to_timestamp(cudf::strings_column_view const &input,
       thrust::make_counting_iterator(input.size()),
       thrust::make_zip_iterator(
           thrust::make_tuple(output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
-                             output_bool->mutable_view().begin<bool>())),
+                              output_bool->mutable_view().begin<bool>())),
       parse_timestamp_string_fn{*d_strings, default_time_zone.data(),
                                 static_cast<cudf::size_type>(default_time_zone.size()),
-                                allow_time_zone, allow_special_expressions, ansi_mode});
+                                allow_time_zone, allow_special_expressions});
 
   return std::make_pair(std::move(output_timestamp), std::move(output_bool));
 }
 
 } // namespace
 
-namespace spark_rapids_jni {
+namespace spark_rapids_jni
+{
 
-std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+std::pair<std::unique_ptr<cudf::column>, bool>
 parse_string_to_timestamp(cudf::strings_column_view const &input,
                           std::string_view const &default_time_zone, bool allow_time_zone,
-                          bool allow_special_expressions, bool ansi_mode) {
+                          bool allow_special_expressions, bool ansi_mode)
+{
   auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS};
-  if (input.size() == 0) {
-    return std::make_pair(cudf::make_empty_column(timestamp_type.id()),
-                          cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}));
+  if (input.size() == 0)
+  {
+    return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true);
   }
 
   auto const stream = cudf::get_default_stream();
   auto const mr = rmm::mr::get_current_device_resource();
-  return string_to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions,
-                             ansi_mode, stream, mr);
+  auto [timestamp_column, valid_column] = to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions,
+                                                        stream, mr);
+  if (ansi_mode)
+  {
+    cudf::numeric_scalar<bool> false_scalar{false, true, stream};
+    if (cudf::contains(*valid_column, false_scalar, stream))
+    {
+      return std::make_pair(nullptr, false);
+    }
+    else
+    {
+      return std::make_pair(std::move(timestamp_column), true);
+    }
+  }
+  else
+  {
+    return std::make_pair(std::move(timestamp_column), true);
+  }
+}
+
+std::pair<std::unique_ptr<cudf::column>, bool>
+string_to_timestamp(cudf::strings_column_view const &input,
+                    std::string_view const &default_time_zone,
+                    bool allow_special_expressions,
+                    bool ansi_mode)
+{
+  return parse_string_to_timestamp(input, default_time_zone, true, allow_special_expressions, ansi_mode);
+}
+
+std::pair<std::unique_ptr<cudf::column>, bool>
+string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
+                                      bool allow_time_zone,
+                                      bool allow_special_expressions,
+                                      bool ansi_mode)
+{
+  return parse_string_to_timestamp(input, std::string_view(""), allow_time_zone, allow_special_expressions, ansi_mode);
 }
 
 } // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 139e69086b..26ecf421a3 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -63,10 +63,31 @@ namespace spark_rapids_jni {
  * @param ansi_mode is ansi mode
  * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise.
  */
-std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
+std::pair<std::unique_ptr<cudf::column>, bool>
 parse_string_to_timestamp(cudf::strings_column_view const &input,
                           std::string_view const &default_time_zone,
                           bool allow_time_zone,
                           bool allow_special_expressions,
                           bool ansi_mode);
+/**
+ * Refer to `parse_string_to_timestamp`
+ * If timestamp string does not contain date info(yyyy mm dd), use current date
+*/
+std::pair<std::unique_ptr<cudf::column>, bool>
+string_to_timestamp(cudf::strings_column_view const &input,
+                          std::string_view const &default_time_zone,
+                          bool allow_special_expressions,
+                          bool ansi_mode);
+
+/**
+ * Refer to `parse_string_to_timestamp`
+ * 
+ * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: 
+ *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
+*/
+std::pair<std::unique_ptr<cudf::column>, bool>
+string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
+                          bool allow_time_zone,
+                          bool allow_special_expressions,
+                          bool ansi_mode);
 } // namespace spark_rapids_jni
diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt
index 617df6dfde..1f58176327 100644
--- a/src/main/cpp/tests/CMakeLists.txt
+++ b/src/main/cpp/tests/CMakeLists.txt
@@ -57,6 +57,9 @@ ConfigureTest(FORMAT_FLOAT
 ConfigureTest(CAST_FLOAT_TO_STRING
     cast_float_to_string.cpp)
 
+ConfigureTest(DATETIME_PARSER
+    datetime_parser.cpp)
+
 ConfigureTest(DATETIME_REBASE
     datetime_rebase.cpp)
 
diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp
new file mode 100644
index 0000000000..ff6c7b79db
--- /dev/null
+++ b/src/main/cpp/tests/datetime_parser.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+#include <cstring>
+
+#include <datetime_parser.hpp>
+
+//
+
+#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/strings/convert/convert_durations.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/unary.hpp>
+#include <cudf/wrappers/durations.hpp>
+#include <cudf/wrappers/timestamps.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+using timestamp_col = cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
+
+struct DateTimeParserTest : public cudf::test::BaseFixture
+{
+};
+
+TEST_F(DateTimeParserTest, ParseTimestamp)
+{
+  auto const ts_col = timestamp_col{
+      -719162L, -354285L, -141714, -141438, -141437, -141432, -141427, -31463, -31453, -1, 0, 18335};
+
+  auto const ts_strings =
+      cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z",
+                                         "2023-11-05T03:04:55 ",
+                                         "2023-11-05T03:04:55.123456   "};
+  auto const parsed_ts =
+      cudf::strings::string_to_timestamp(cudf::strings_column_view(ts_strings),
+                                         "Z",
+                                         true,
+                                         true);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *parsed_ts);
+}

From a4a83c02ebd929fc2292ea7e84043f0e61526e27 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Tue, 12 Dec 2023 18:08:24 +0800
Subject: [PATCH 04/35] Update

---
 src/main/cpp/src/datetime_parser.cu    | 17 +++++++++++------
 src/main/cpp/tests/datetime_parser.cpp | 18 +++++++++---------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index cd5b53e32f..4b607f4876 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -62,12 +62,15 @@ struct timestamp_components
 __device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
 to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone)
 {
-  // TODO replace the temp implementation
-  long v = 365L * 86400L * 1000000L * components.year + 30L * 86400L * 1000000L * components.month +
-            86400L * 1000000L * components.day + 3600L * 1000000L * components.hour +
-            60L * 1000000L * components.minute + 1000000L * components.second +
-            components.microseconds;
-  return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{v}}, true);
+  // TODO replace the fake implementation
+  long seconds = components.year * 365L * 86400L +
+                 components.month * 30L * 86400L +
+                 components.day * 86400L +
+                 components.hour * 3600L +
+                 components.minute * 60L +
+                 components.second;
+  long us = seconds * 1000000L + components.microseconds;
+  return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true);
 }
 
 __device__ __host__ inline bool is_whitespace(const char chr)
@@ -492,11 +495,13 @@ parse_string_to_timestamp(cudf::strings_column_view const &input,
     }
     else
     {
+      // TODO update bitmask
       return std::make_pair(std::move(timestamp_column), true);
     }
   }
   else
   {
+    // TODO update bitmask
     return std::make_pair(std::move(timestamp_column), true);
   }
 }
diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp
index ff6c7b79db..47a733a41e 100644
--- a/src/main/cpp/tests/datetime_parser.cpp
+++ b/src/main/cpp/tests/datetime_parser.cpp
@@ -39,17 +39,17 @@ struct DateTimeParserTest : public cudf::test::BaseFixture
 
 TEST_F(DateTimeParserTest, ParseTimestamp)
 {
-  auto const ts_col = timestamp_col{
-      -719162L, -354285L, -141714, -141438, -141437, -141432, -141427, -31463, -31453, -1, 0, 18335};
+  auto v = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L + 3L * 3600L + 4L * 60L + 55L) * 1000000L;
+  auto const ts_col = timestamp_col{v, v, v + 123456};
 
   auto const ts_strings =
       cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z",
-                                         "2023-11-05T03:04:55 ",
-                                         "2023-11-05T03:04:55.123456   "};
+                                        "2023-11-05T03:04:55 ",
+                                        "2023-11-05T03:04:55.123456   "};
   auto const parsed_ts =
-      cudf::strings::string_to_timestamp(cudf::strings_column_view(ts_strings),
-                                         "Z",
-                                         true,
-                                         true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *parsed_ts);
+      spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings),
+                                            "Z",
+                                            true,
+                                            true);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(parsed_ts.first));
 }

From 89eef6b34e98c33da84ad1373c1eaf83ee34393d Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 14 Dec 2023 09:48:49 +0800
Subject: [PATCH 05/35] Fix bitmask; Parse special timestamp strings: now,
 today ...; Add Ansi mode check

---
 src/main/cpp/src/datetime_parser.cu    | 621 +++++++++++++------------
 src/main/cpp/src/datetime_parser.hpp   | 104 +++--
 src/main/cpp/tests/datetime_parser.cpp | 169 ++++++-
 3 files changed, 546 insertions(+), 348 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 4b607f4876..6411c00017 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -18,11 +18,14 @@
 #include <numeric>
 #include <vector>
 
+#include <iostream>
+
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
-
+#include <cudf/detail/valid_if.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/search.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -39,15 +42,13 @@
 
 #include "datetime_parser.hpp"
 
-namespace
-{
+namespace {
 
 /**
- * represents local date time in a time zone.
+ * Represents local date time in a time zone.
  */
-struct timestamp_components
-{
-  int32_t year; // max 6 digits
+struct timestamp_components {
+  int32_t year;  // max 6 digits
   int8_t month;
   int8_t day;
   int8_t hour;
@@ -57,304 +58,255 @@ struct timestamp_components
 };
 
 /**
- * convert a local time in a time zone to UTC timestamp
+ * Convert a local time in a time zone to a UTC timestamp
  */
-__device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
-to_utc_timestamp(timestamp_components components, cudf::string_view const &time_zone)
+__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> to_utc_timestamp(
+  timestamp_components const& components, cudf::string_view const& time_zone)
 {
-  // TODO replace the fake implementation
-  long seconds = components.year * 365L * 86400L +
-                 components.month * 30L * 86400L +
-                 components.day * 86400L +
-                 components.hour * 3600L +
-                 components.minute * 60L +
+  // TODO replace the following fake implementation
+  long seconds = components.year * 365L * 86400L + components.month * 30L * 86400L +
+                 components.day * 86400L + components.hour * 3600L + components.minute * 60L +
                  components.second;
   long us = seconds * 1000000L + components.microseconds;
   return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true);
 }
 
+/**
+ * Returns whether `chr` is a whitespace character: ' ', '\r', '\t' or '\n'
+ */
 __device__ __host__ inline bool is_whitespace(const char chr)
 {
-  switch (chr)
-  {
-  case ' ':
-  case '\r':
-  case '\t':
-  case '\n':
-    return true;
-  default:
-    return false;
+  switch (chr) {
+    case ' ':
+    case '\r':
+    case '\t':
+    case '\n': return true;
+    default: return false;
   }
 }
 
-// compare 2 strings are equal ignore case, the expect string should be lower-case
-__device__ __host__ inline bool equals(const char *actual_begin, const char *actual_end,
-                                        const char *expect_begin, const char *expect_end)
+/**
+ * Returns whether the two given strings are equal, ignoring ASCII case.
+ * Used to match the special timestamp strings:
+ *   "epoch", "now", "today", "yesterday", "tomorrow"
+ * The expected string must consist of lower-case a-z chars only.
+ */
+__device__ __host__ inline bool equals_ascii_ignore_case(char const* actual_begin,
+                                                         char const* actual_end,
+                                                         char const* expect_begin,
+                                                         char const* expect_end)
 {
-  if (actual_end - actual_begin != expect_end - expect_begin)
-  {
-    return false;
-  }
+  if (actual_end - actual_begin != expect_end - expect_begin) { return false; }
 
-  while (actual_begin < actual_end)
-  {
+  while (expect_begin < expect_end) {
     // the diff between upper case and lower case for a same char is 32
-    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32))
-    {
-      return false;
-    }
+    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; }
     actual_begin++;
     expect_begin++;
   }
   return true;
 }
 
+/**
+ * Ported from Spark
+ */
 __device__ __host__ bool is_valid_digits(int segment, int digits)
 {
   // A Long is able to represent a timestamp within [+-]200 thousand years
   const int constexpr maxDigitsYear = 6;
   // For the nanosecond part, more than 6 digits is allowed, but will be truncated.
   return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
-          // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID
-          (segment == 7 && digits <= 2) ||
-          (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
+         // For the zoneId segment(7), it could be zero digits when it's a region-based zone ID
+         (segment == 7 && digits <= 2) ||
+         (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
-__device__ __host__ thrust::tuple<cudf::timestamp_us, bool>
-parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char *default_time_zone,
-                              cudf::size_type default_time_zone_char_len, bool allow_time_zone,
-                              bool allow_special_expressions)
+/**
+ * Ported from Spark:
+ *   https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ *   org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+ *
+ * Parse a string with time zone to a timestamp.
+ * The bool in the returned tuple is false if the parse failed.
+ */
+__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> parse_string_to_timestamp_us(
+  cudf::string_view const& timestamp_str,
+  const char* default_time_zone,
+  cudf::size_type default_time_zone_char_len,
+  bool allow_time_zone,
+  bool allow_special_expressions,
+  cudf::timestamp_us epoch,
+  cudf::timestamp_us now,
+  cudf::timestamp_us today,
+  cudf::timestamp_us tomorrow,
+  cudf::timestamp_us yesterday)
 {
-
   auto error_us = cudf::timestamp_us{cudf::duration_us{0}};
 
-  if (timestamp_str.empty())
-  {
-    return thrust::make_tuple(error_us, false);
-  }
+  if (timestamp_str.empty()) { return thrust::make_tuple(error_us, false); }
 
-  const char *curr_ptr = timestamp_str.data();
-  const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
+  const char* curr_ptr = timestamp_str.data();
+  const char* end_ptr  = curr_ptr + timestamp_str.size_bytes();
 
   // trim left
-  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr))
-  {
+  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
     ++curr_ptr;
   }
   // trim right
-  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1)))
-  {
+  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) {
     --end_ptr;
   }
 
   // special strings: epoch, now, today, yesterday, tomorrow
-  // TODO
-  if (allow_special_expressions)
-  {
-    char const *begin_epoch = "epoch";
-    char const *end_epoch = begin_epoch + 5;
+  if (allow_special_expressions) {
+    char const* begin_epoch = "epoch";
+    char const* end_epoch   = begin_epoch + 5;
 
-    char const *begin_now = "now";
-    char const *end_now = begin_now + 3;
+    char const* begin_now = "now";
+    char const* end_now   = begin_now + 3;
 
-    char const *begin_today = "today";
-    char const *end_today = begin_today + 5;
+    char const* begin_today = "today";
+    char const* end_today   = begin_today + 5;
 
-    char const *begin_yesterday = "yesterday";
-    char const *end_yesterday = begin_yesterday + 9;
+    char const* begin_tomorrow = "tomorrow";
+    char const* end_tomorrow   = begin_tomorrow + 8;
 
-    char const *begin_tomorrow = "tomorrow";
-    char const *end_tomorrow = begin_tomorrow + 8;
-    if (equals(curr_ptr, end_ptr, begin_epoch, end_epoch))
-    { // epoch
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, true);
-    }
-    else if (equals(curr_ptr, end_ptr, begin_now, end_now))
-    {
+    char const* begin_yesterday = "yesterday";
+    char const* end_yesterday   = begin_yesterday + 9;
+
+    if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_epoch, end_epoch)) {
+      // epoch
+      return thrust::make_tuple(epoch, true);
+    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_now, end_now)) {
       // now
-    }
-    else if (equals(curr_ptr, end_ptr, begin_today, end_today))
-    {
+      return thrust::make_tuple(now, true);
+    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_today, end_today)) {
       // today
-    }
-    else if (equals(curr_ptr, end_ptr, begin_yesterday, end_yesterday))
-    {
-      // yesterday
-    }
-    else if (equals(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow))
-    {
+      return thrust::make_tuple(today, true);
+    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) {
       // tomorrow
+      return thrust::make_tuple(tomorrow, true);
+    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) {
+      // yesterday
+      return thrust::make_tuple(yesterday, true);
     }
   }
 
-  if (curr_ptr == end_ptr)
-  {
-    return thrust::make_tuple(error_us, false);
-  }
+  if (curr_ptr == end_ptr) { return thrust::make_tuple(error_us, false); }
 
-  const char *const bytes = curr_ptr;
+  const char* const bytes   = curr_ptr;
   const size_t bytes_length = end_ptr - curr_ptr;
 
   thrust::optional<cudf::string_view> tz;
-  int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0};
-  int segments_len = 9;
-  int i = 0;
-  int current_segment_value = 0;
+  int segments[]             = {1, 1, 1, 0, 0, 0, 0, 0, 0};
+  int segments_len           = 9;
+  int i                      = 0;
+  int current_segment_value  = 0;
   int current_segment_digits = 0;
-  size_t j = 0;
-  int digits_milli = 0;
-  bool just_time = false;
+  size_t j                   = 0;
+  int digits_milli           = 0;
+  bool just_time             = false;
   thrust::optional<int> year_sign;
-  if ('-' == bytes[j] || '+' == bytes[j])
-  {
-    if ('-' == bytes[j])
-    {
+  if ('-' == bytes[j] || '+' == bytes[j]) {
+    if ('-' == bytes[j]) {
       year_sign = -1;
-    }
-    else
-    {
+    } else {
       year_sign = 1;
     }
     j += 1;
   }
 
-  while (j < bytes_length)
-  {
-    char b = bytes[j];
+  while (j < bytes_length) {
+    char b           = bytes[j];
     int parsed_value = static_cast<int32_t>(b - '0');
-    if (parsed_value < 0 || parsed_value > 9)
-    {
-      if (0 == j && 'T' == b)
-      {
+    if (parsed_value < 0 || parsed_value > 9) {
+      if (0 == j && 'T' == b) {
         just_time = true;
         i += 3;
-      }
-      else if (i < 2)
-      {
-        if (b == '-')
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+      } else if (i < 2) {
+        if (b == '-') {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
-        }
-        else if (0 == i && ':' == b && !year_sign.has_value())
-        {
+        } else if (0 == i && ':' == b && !year_sign.has_value()) {
           just_time = true;
-          if (!is_valid_digits(3, current_segment_digits))
-          {
+          if (!is_valid_digits(3, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[3] = current_segment_value;
-          current_segment_value = 0;
+          segments[3]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
-          i = 4;
-        }
-        else
-        {
+          i                      = 4;
+        } else {
           return thrust::make_tuple(error_us, false);
         }
-      }
-      else if (2 == i)
-      {
-        if (' ' == b || 'T' == b)
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+      } else if (2 == i) {
+        if (' ' == b || 'T' == b) {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
-        }
-        else
-        {
+        } else {
           return thrust::make_tuple(error_us, false);
         }
-      }
-      else if (3 == i || 4 == i)
-      {
-        if (':' == b)
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+      } else if (3 == i || 4 == i) {
+        if (':' == b) {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
-        }
-        else
-        {
+        } else {
           return thrust::make_tuple(error_us, false);
         }
-      }
-      else if (5 == i || 6 == i)
-      {
-        if ('.' == b && 5 == i)
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+      } else if (5 == i || 6 == i) {
+        if ('.' == b && 5 == i) {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
-        }
-        else
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+        } else {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
           tz = cudf::string_view(bytes + j, (bytes_length - j));
-          j = bytes_length - 1;
+          j  = bytes_length - 1;
         }
-        if (i == 6 && '.' != b)
-        {
-          i += 1;
-        }
-      }
-      else
-      {
-        if (i < segments_len && (':' == b || ' ' == b))
-        {
-          if (!is_valid_digits(i, current_segment_digits))
-          {
+        if (i == 6 && '.' != b) { i += 1; }
+      } else {
+        if (i < segments_len && (':' == b || ' ' == b)) {
+          if (!is_valid_digits(i, current_segment_digits)) {
             return thrust::make_tuple(error_us, false);
           }
-          segments[i] = current_segment_value;
-          current_segment_value = 0;
+          segments[i]            = current_segment_value;
+          current_segment_value  = 0;
           current_segment_digits = 0;
           i += 1;
-        }
-        else
-        {
+        } else {
           return thrust::make_tuple(error_us, false);
         }
       }
-    }
-    else
-    {
-      if (6 == i)
-      {
-        digits_milli += 1;
-      }
+    } else {
+      if (6 == i) { digits_milli += 1; }
       // We will truncate the nanosecond part if there are more than 6 digits, which results
       // in loss of precision
-      if (6 != i || current_segment_digits < 6)
-      {
+      if (6 != i || current_segment_digits < 6) {
         current_segment_value = current_segment_value * 10 + parsed_value;
       }
       current_segment_digits += 1;
@@ -362,74 +314,82 @@ parse_string_to_timestamp_us(cudf::string_view const &timestamp_str, const char
     j += 1;
   }
 
-  if (!is_valid_digits(i, current_segment_digits))
-  {
-    return thrust::make_tuple(error_us, false);
-  }
+  if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); }
   segments[i] = current_segment_value;
 
-  while (digits_milli < 6)
-  {
+  while (digits_milli < 6) {
     segments[6] *= 10;
     digits_milli += 1;
   }
 
-  if (default_time_zone_char_len == 0)
-  {
+  cudf::string_view time_zone;
+  if (tz.has_value()) {
+    time_zone = tz.value();
+  } else {
+    time_zone = cudf::string_view(default_time_zone, default_time_zone_char_len);
+  }
+
+  segments[0] *= year_sign.value_or(1);
+  // above is ported from Spark.
+
+  if (default_time_zone_char_len == 0) {
     // invoke from `string_to_timestamp_without_time_zone`
-    if (just_time || !allow_time_zone && tz.has_value())
-    {
+    if (just_time || !allow_time_zone && tz.has_value()) {
       return thrust::make_tuple(error_us, false);
     }
-  }
-  else
-  {
+  } else {
     // invoke from `string_to_timestamp`
-    if (just_time)
-    {
-      // TODO
-      // set today: year-month-day
+    if (just_time) {
+      // TODO: update here to support the following formats:
+      //   `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+      //   `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+      // by setting the local date (year-month-day) in the time zone.
+      // Both formats are time zone related; Spark uses LocalDate.now(zoneId).
+
+      // Time-only strings are not supported currently.
+      return thrust::make_tuple(error_us, false);
     }
   }
 
-  cudf::string_view timze_zone;
-  if (tz.has_value())
-  {
-    timze_zone = tz.value();
-  }
-  else
-  {
-    timze_zone = cudf::string_view(default_time_zone, default_time_zone_char_len);
-  }
-
-  segments[0] *= year_sign.value_or(1);
-  // above is ported from Spark.
-
   // set components
   auto components = timestamp_components{segments[0],
-                                          static_cast<int8_t>(segments[1]),
-                                          static_cast<int8_t>(segments[2]),
-                                          static_cast<int8_t>(segments[3]),
-                                          static_cast<int8_t>(segments[4]),
-                                          static_cast<int8_t>(segments[5]),
-                                          segments[6]};
-
-  return to_utc_timestamp(components, timze_zone);
+                                         static_cast<int8_t>(segments[1]),
+                                         static_cast<int8_t>(segments[2]),
+                                         static_cast<int8_t>(segments[3]),
+                                         static_cast<int8_t>(segments[4]),
+                                         static_cast<int8_t>(segments[5]),
+                                         segments[6]};
+
+  return to_utc_timestamp(components, time_zone);
 }
 
-struct parse_timestamp_string_fn
-{
+struct parse_timestamp_string_fn {
   cudf::column_device_view const d_strings;
-  const char *default_time_zone;
+  const char* default_time_zone;
   cudf::size_type default_time_zone_char_len;
   bool allow_time_zone;
   bool allow_special_expressions;
-
-  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type &idx) const
+  // TODO: the following placeholder values should be passed in by the caller.
+  // Note: today, tomorrow, yesterday depend on the time zone and should be generated from it.
+  cudf::timestamp_us epoch     = cudf::timestamp_us{cudf::duration_us{111L}};
+  cudf::timestamp_us now       = cudf::timestamp_us{cudf::duration_us{222L}};
+  cudf::timestamp_us today     = cudf::timestamp_us{cudf::duration_us{333L}};
+  cudf::timestamp_us tomorrow  = cudf::timestamp_us{cudf::duration_us{444L}};
+  cudf::timestamp_us yesterday = cudf::timestamp_us{cudf::duration_us{555L}};
+
+  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type& idx) const
   {
     auto const d_str = d_strings.element<cudf::string_view>(idx);
-    return parse_string_to_timestamp_us(d_str, default_time_zone, default_time_zone_char_len,
-                                        allow_time_zone, allow_special_expressions);
+    return parse_string_to_timestamp_us(d_str,
+                                        default_time_zone,
+                                        default_time_zone_char_len,
+                                        allow_time_zone,
+                                        allow_special_expressions,
+                                        epoch,
+                                        now,
+                                        today,
+                                        tomorrow,
+                                        yesterday);
   }
 };
 
@@ -438,90 +398,153 @@ struct parse_timestamp_string_fn
  * Trims and parses timestamp string column to a timestamp column and a is valid column
  *
  */
-std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
-to_timestamp(cudf::strings_column_view const &input,
-              std::string_view const &default_time_zone, bool allow_time_zone,
-              bool allow_special_expressions, rmm::cuda_stream_view stream,
-              rmm::mr::device_memory_resource *mr)
+std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>> to_timestamp(
+  cudf::strings_column_view const& input,
+  std::string_view const& default_time_zone,
+  bool allow_time_zone,
+  bool allow_special_expressions,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto output_timestamp = cudf::make_timestamp_column(
-      cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(),
-      cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr);
-  auto output_bool = cudf::make_fixed_width_column(
-      cudf::data_type{cudf::type_id::BOOL8}, input.size(),
-      cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count(), stream, mr);
+  auto output_timestamp =
+    cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
+                                input.size(),
+                                cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                input.null_count(),
+                                stream,
+                                mr);
+  // Records which strings failed to parse.
+  auto output_bool =
+    cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8},
+                                  input.size(),
+                                  cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                  input.null_count(),
+                                  stream,
+                                  mr);
 
   thrust::transform(
-      rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(input.size()),
-      thrust::make_zip_iterator(
-          thrust::make_tuple(output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
-                              output_bool->mutable_view().begin<bool>())),
-      parse_timestamp_string_fn{*d_strings, default_time_zone.data(),
-                                static_cast<cudf::size_type>(default_time_zone.size()),
-                                allow_time_zone, allow_special_expressions});
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(input.size()),
+    thrust::make_zip_iterator(
+      thrust::make_tuple(output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
+                         output_bool->mutable_view().begin<bool>())),
+    parse_timestamp_string_fn{*d_strings,
+                              default_time_zone.data(),
+                              static_cast<cudf::size_type>(default_time_zone.size()),
+                              allow_time_zone,
+                              allow_special_expressions});
 
   return std::make_pair(std::move(output_timestamp), std::move(output_bool));
 }
 
-} // namespace
-
-namespace spark_rapids_jni
+/**
+ * Set the null mask of timestamp column according to the valid column.
+ */
+void update_bitmask(cudf::column& timestamp_column,
+                    cudf::column const& validity_column,
+                    rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource* mr)
 {
+  auto const& ts_view    = timestamp_column.view();
+  auto const& valid_view = validity_column.view();
+  std::vector<cudf::bitmask_type const*> masks;
+  std::vector<cudf::size_type> offsets;
+  if (timestamp_column.nullable()) {
+    masks.push_back(ts_view.null_mask());
+    offsets.push_back(ts_view.offset());
+  }
+
+  // generate bitmask from `validity_column`
+  auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if(
+    valid_view.begin<bool>(), valid_view.end<bool>(), thrust::identity<bool>{}, stream, mr);
 
-std::pair<std::unique_ptr<cudf::column>, bool>
-parse_string_to_timestamp(cudf::strings_column_view const &input,
-                          std::string_view const &default_time_zone, bool allow_time_zone,
-                          bool allow_special_expressions, bool ansi_mode)
+  masks.push_back(static_cast<cudf::bitmask_type*>(valid_bitmask.data()));
+  offsets.push_back(0);
+
+  // merge 2 bitmasks 
+  auto [null_mask, null_count] =
+    cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr);
+
+  timestamp_column.set_null_mask(null_mask, null_count);
+}
+
+/**
+ * Parse string column with time zone to timestamp column,
+ * Returns a pair of the timestamp column and a bool indicating whether parsing succeeded.
+ */
+std::pair<std::unique_ptr<cudf::column>, bool> parse_string_to_timestamp(
+  cudf::strings_column_view const& input,
+  std::string_view const& default_time_zone,
+  bool allow_time_zone,
+  bool allow_special_expressions,
+  bool ansi_mode)
 {
   auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS};
-  if (input.size() == 0)
-  {
+  if (input.size() == 0) {
     return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true);
   }
 
   auto const stream = cudf::get_default_stream();
-  auto const mr = rmm::mr::get_current_device_resource();
-  auto [timestamp_column, valid_column] = to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions,
-                                                        stream, mr);
-  if (ansi_mode)
-  {
-    cudf::numeric_scalar<bool> false_scalar{false, true, stream};
-    if (cudf::contains(*valid_column, false_scalar, stream))
-    {
+  auto const mr     = rmm::mr::get_current_device_resource();
+  auto [timestamp_column, validity_column] =
+    to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, stream, mr);
+
+  if (ansi_mode) {
+    // create scalar, value is false, is_valid is true
+    cudf::numeric_scalar<bool> false_scalar{false, true, stream, mr};
+    if (cudf::contains(*validity_column, false_scalar, stream)) {
+      // has invalid value in validity column under ansi mode
       return std::make_pair(nullptr, false);
-    }
-    else
-    {
-      // TODO update bitmask
+    } else {
+      update_bitmask(*timestamp_column, *validity_column, stream, mr);
       return std::make_pair(std::move(timestamp_column), true);
     }
-  }
-  else
-  {
-    // TODO update bitmask
+  } else {
+    update_bitmask(*timestamp_column, *validity_column, stream, mr);
     return std::make_pair(std::move(timestamp_column), true);
   }
 }
 
-std::pair<std::unique_ptr<cudf::column>, bool>
-string_to_timestamp(cudf::strings_column_view const &input,
-                    std::string_view const &default_time_zone,
-                    bool allow_special_expressions,
-                    bool ansi_mode)
+}  // namespace
+
+namespace spark_rapids_jni {
+
+/**
+ * Parse string column with time zone to timestamp column,
+ * Returns a pair of the timestamp column and a bool indicating whether parsing succeeded.
+ * If does not have time zone in string, use the default time zone.
+ */
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
+  cudf::strings_column_view const& input,
+  std::string_view const& default_time_zone,
+  bool allow_special_expressions,
+  bool ansi_mode)
 {
-  return parse_string_to_timestamp(input, default_time_zone, true, allow_special_expressions, ansi_mode);
+  CUDF_EXPECTS(default_time_zone.size() > 0, "should specify default time zone");
+  return parse_string_to_timestamp(
+    input, default_time_zone, true, allow_special_expressions, ansi_mode);
 }
 
-std::pair<std::unique_ptr<cudf::column>, bool>
-string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
-                                      bool allow_time_zone,
-                                      bool allow_special_expressions,
-                                      bool ansi_mode)
+/**
+ * Parse string column with time zone to timestamp column,
+ * Returns a pair of the timestamp column and a bool indicating whether parsing succeeded.
+ * Do not use the time zone in string.
+ * If allow_time_zone is false and string contains time zone, then the string is invalid.
+ */
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
+  cudf::strings_column_view const& input,
+  bool allow_time_zone,
+  bool allow_special_expressions,
+  bool ansi_mode)
 {
-  return parse_string_to_timestamp(input, std::string_view(""), allow_time_zone, allow_special_expressions, ansi_mode);
+  return parse_string_to_timestamp(input,
+                                   std::string_view(""),  // specify empty time zone
+                                   allow_time_zone,
+                                   allow_special_expressions,
+                                   ansi_mode);
 }
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 26ecf421a3..d2f1dfa39c 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -26,8 +26,7 @@ namespace spark_rapids_jni {
  * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
  * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
  *
- * Formats are:
- *
+ * Spark supports the following formats:
  * `[+-]yyyy*`
  * `[+-]yyyy*-[m]m`
  * `[+-]yyyy*-[m]m-[d]d`
@@ -37,6 +36,14 @@ namespace spark_rapids_jni {
  * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
  * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
  *
+ * Unlike Spark, Spark-Rapids only supports the following formats:
+ * `[+-]yyyy*`
+ * `[+-]yyyy*-[m]m`
+ * `[+-]yyyy*-[m]m-[d]d`
+ * `[+-]yyyy*-[m]m-[d]d `
+ * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ *
  * Spark supports the following zone id forms:
  *   - Z - Zulu time zone UTC+0
  *   - +|-[h]h:[m]m
@@ -57,37 +64,72 @@ namespace spark_rapids_jni {
  *
  * @param input input string column view.
  * @param default_time_zone if input string does not contain a time zone, use this time zone.
- * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: 
- *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
- * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings.
  * @param ansi_mode is ansi mode
- * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not empty otherwise.
+ * @returns a pair of the timestamp column and a success flag. In ansi mode, if any string fails
+ * to parse, the column is nullptr and the flag is false; otherwise the flag is true.
  */
-std::pair<std::unique_ptr<cudf::column>, bool>
-parse_string_to_timestamp(cudf::strings_column_view const &input,
-                          std::string_view const &default_time_zone,
-                          bool allow_time_zone,
-                          bool allow_special_expressions,
-                          bool ansi_mode);
-/**
- * Refer to `parse_string_to_timestamp`
- * If timestamp string does not contain date info(yyyy mm dd), use current date
-*/
-std::pair<std::unique_ptr<cudf::column>, bool>
-string_to_timestamp(cudf::strings_column_view const &input,
-                          std::string_view const &default_time_zone,
-                          bool allow_special_expressions,
-                          bool ansi_mode);
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
+  cudf::strings_column_view const& input,
+  std::string_view const& default_time_zone,
+  bool allow_special_expressions,
+  bool ansi_mode);
 
 /**
- * Refer to `parse_string_to_timestamp`
- * 
- * @param allow_time_zone whether allow time zone in the timestamp string. e.g.: 
+ *
+ * Trims and parses a timestamp string column with time zone suffix to a timestamp column.
+ * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00
+ *
+ * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+ * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+ *
+ * Spark supports the following formats:
+ * `[+-]yyyy*`
+ * `[+-]yyyy*-[m]m`
+ * `[+-]yyyy*-[m]m-[d]d`
+ * `[+-]yyyy*-[m]m-[d]d `
+ * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ *
+ * Unlike Spark, Spark-Rapids only supports the following formats:
+ * `[+-]yyyy*`
+ * `[+-]yyyy*-[m]m`
+ * `[+-]yyyy*-[m]m-[d]d`
+ * `[+-]yyyy*-[m]m-[d]d `
+ * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+ *
+ * Spark supports the following zone id forms:
+ *   - Z - Zulu time zone UTC+0
+ *   - +|-[h]h:[m]m
+ *   - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+ *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
+ *     and a suffix in the formats:
+ *     - +|-h[h]
+ *     - +|-hh[:]mm
+ *     - +|-hh:mm:ss
+ *     - +|-hhmmss
+ *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+ *
+ * Unlike Spark, Spark-Rapids only supports the following time zones:
+ *   - Z - Zulu time zone UTC+0
+ *   - +|-[h]h:[m]m
+ *   - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+ *
+ *
+ * @param input input string column view.
+ * @param allow_time_zone whether allow time zone in the timestamp string. e.g.:
  *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
-*/
-std::pair<std::unique_ptr<cudf::column>, bool>
-string_to_timestamp_without_time_zone(cudf::strings_column_view const &input,
-                          bool allow_time_zone,
-                          bool allow_special_expressions,
-                          bool ansi_mode);
-} // namespace spark_rapids_jni
+ * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings.
+ * @param ansi_mode is ansi mode
+ * @returns a pair of the timestamp column and a success flag. In ansi mode, if any string fails
+ * to parse, the column is nullptr and the flag is false; otherwise the flag is true.
+ */
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
+  cudf::strings_column_view const& input,
+  bool allow_time_zone,
+  bool allow_special_expressions,
+  bool ansi_mode);
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp
index 47a733a41e..23d20bcbb6 100644
--- a/src/main/cpp/tests/datetime_parser.cpp
+++ b/src/main/cpp/tests/datetime_parser.cpp
@@ -31,25 +31,158 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-using timestamp_col = cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
-
-struct DateTimeParserTest : public cudf::test::BaseFixture
-{
-};
+using timestamp_col =
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
+using micros_col =
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
+struct DateTimeParserTest : public cudf::test::BaseFixture {};
 
 TEST_F(DateTimeParserTest, ParseTimestamp)
 {
-  auto v = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L + 3L * 3600L + 4L * 60L + 55L) * 1000000L;
-  auto const ts_col = timestamp_col{v, v, v + 123456};
-
-  auto const ts_strings =
-      cudf::test::strings_column_wrapper{"2023-11-05T03:04:55Z",
-                                        "2023-11-05T03:04:55 ",
-                                        "2023-11-05T03:04:55.123456   "};
-  auto const parsed_ts =
-      spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings),
-                                            "Z",
-                                            true,
-                                            true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(parsed_ts.first));
+  auto ts_strings = cudf::test::strings_column_wrapper(
+    {
+      "2023",
+      " 2023 ",
+      " 2023-11 ",
+      " 2023-11-5 ",
+      " 2023-11-05 3:04:55   ",
+      " 2023-11-05T03:4:55   ",
+      " 2023-11-05T3:4:55   ",
+      "  2023-11-5T3:4:55.",
+      "  2023-11-5T3:4:55.Iran",
+      "  2023-11-5T3:4:55.1 ",
+      "  2023-11-5T3:4:55.1Iran",
+      "  2023-11-05T03:04:55.123456  ",
+      "  2023-11-05T03:04:55.123456Iran  ",
+      " 222222 ",
+      " ",  // invalid
+      "",   // invalid
+      "1-"  // invalid
+
+    },
+    {
+
+      0,  // null bit
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1
+
+    });
+  auto d_2023_1_1           = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L;
+  auto d_2023_11_1          = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L;
+  auto d_2023_11_5          = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L;
+  auto t_3_4_55             = (3L * 3600L + 4L * 60L + 55L) * 1000000L;
+  auto d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55;
+  auto ts_col               = timestamp_col(
+    {
+
+                    0L,
+                    d_2023_1_1,
+                    d_2023_11_1,
+                    d_2023_11_5,
+                    d_2023_11_5_t_3_4_55,
+                    d_2023_11_5_t_3_4_55,
+                    d_2023_11_5_t_3_4_55,
+                    d_2023_11_5_t_3_4_55,
+                    d_2023_11_5_t_3_4_55,
+                    d_2023_11_5_t_3_4_55 + 100000,
+                    d_2023_11_5_t_3_4_55 + 100000,
+                    d_2023_11_5_t_3_4_55 + 123456,
+                    d_2023_11_5_t_3_4_55 + 123456,
+                    (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
+                    0L,
+                    0L,
+                    0L
+
+    },
+    {
+                    0,  // null bit
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    0,  // null bit
+                    0,  // null bit
+                    0   // null bit
+
+    });
+  auto ret =
+    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, false);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first));
+  assert(ret.second == true);
+
+  ts_strings = cudf::test::strings_column_wrapper(
+    {
+
+      "invalid"
+
+    },
+    {
+
+      1
+
+    });
+  ts_col = timestamp_col(
+    {
+
+      0L
+
+    },
+    {0
+
+    });
+  ret =
+    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true);
+  assert(ret.first == nullptr);
+  assert(ret.second == false);
+
+  ts_strings = cudf::test::strings_column_wrapper(
+    {
+
+      " Epoch  ", " NOW ", "  today  ", "  tomoRRow  ", "  yesTERday  "
+
+    },
+    {
+
+      1, 1, 1, 1, 1
+
+    });
+  ts_col = timestamp_col(
+    {// Temp implement: epoch -> 111, now -> 222, ... , yesterday -> 555
+     111L,
+     222L,
+     333L,
+     444L,
+     555L
+
+    },
+    {1, 1, 1, 1, 1
+
+    });
+  ret =
+    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first));
+  assert(ret.second == true);
 }

From 533f5904235a6e7fb08793886b45df7b938f0133 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 14 Dec 2023 10:12:28 +0800
Subject: [PATCH 06/35] Format code

---
 src/main/cpp/src/datetime_parser.cu    |  2 +-
 src/main/cpp/tests/datetime_parser.cpp | 68 +++++++++++++-------------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 6411c00017..d0d9cee93f 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -464,7 +464,7 @@ void update_bitmask(cudf::column& timestamp_column,
   masks.push_back(static_cast<cudf::bitmask_type*>(valid_bitmask.data()));
   offsets.push_back(0);
 
-  // merge 2 bitmasks 
+  // merge 2 bitmasks
   auto [null_mask, null_count] =
     cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr);
 
diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp
index 23d20bcbb6..9ab4271327 100644
--- a/src/main/cpp/tests/datetime_parser.cpp
+++ b/src/main/cpp/tests/datetime_parser.cpp
@@ -89,43 +89,43 @@ TEST_F(DateTimeParserTest, ParseTimestamp)
   auto ts_col               = timestamp_col(
     {
 
-                    0L,
-                    d_2023_1_1,
-                    d_2023_11_1,
-                    d_2023_11_5,
-                    d_2023_11_5_t_3_4_55,
-                    d_2023_11_5_t_3_4_55,
-                    d_2023_11_5_t_3_4_55,
-                    d_2023_11_5_t_3_4_55,
-                    d_2023_11_5_t_3_4_55,
-                    d_2023_11_5_t_3_4_55 + 100000,
-                    d_2023_11_5_t_3_4_55 + 100000,
-                    d_2023_11_5_t_3_4_55 + 123456,
-                    d_2023_11_5_t_3_4_55 + 123456,
-                    (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
-                    0L,
-                    0L,
-                    0L
+      0L,
+      d_2023_1_1,
+      d_2023_11_1,
+      d_2023_11_5,
+      d_2023_11_5_t_3_4_55,
+      d_2023_11_5_t_3_4_55,
+      d_2023_11_5_t_3_4_55,
+      d_2023_11_5_t_3_4_55,
+      d_2023_11_5_t_3_4_55,
+      d_2023_11_5_t_3_4_55 + 100000,
+      d_2023_11_5_t_3_4_55 + 100000,
+      d_2023_11_5_t_3_4_55 + 123456,
+      d_2023_11_5_t_3_4_55 + 123456,
+      (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
+      0L,
+      0L,
+      0L
 
     },
     {
-                    0,  // null bit
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    1,
-                    0,  // null bit
-                    0,  // null bit
-                    0   // null bit
+      0,  // null bit
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      0,  // null bit
+      0,  // null bit
+      0   // null bit
 
     });
   auto ret =

From 759a6dc73069a9bec70b2a17ea90818109e68f93 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 14 Dec 2023 11:02:03 +0800
Subject: [PATCH 07/35] Update for UTC time zone parser

---
 src/main/cpp/src/datetime_parser.cu | 34 +++++++++++++++++++----------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index d0d9cee93f..7b1be25a43 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -71,6 +71,15 @@ __device__ __host__ thrust::tuple<cudf::timestamp_us, bool> to_utc_timestamp(
   return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true);
 }
 
+/**
+ * Convert a local time in a time zone to a UTC timestamp
+ */
+__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> to_utc_timestamp(
+  timestamp_components const& components)
+{
+  return to_utc_timestamp(components, cudf::string_view("UTC", 3));
+}
+
 /**
  * Is white space
  */
@@ -332,10 +341,20 @@ __device__ __host__ thrust::tuple<cudf::timestamp_us, bool> parse_string_to_time
   segments[0] *= year_sign.value_or(1);
   // above is ported from Spark.
 
+  // set components
+  auto components = timestamp_components{segments[0],
+                                         static_cast<int8_t>(segments[1]),
+                                         static_cast<int8_t>(segments[2]),
+                                         static_cast<int8_t>(segments[3]),
+                                         static_cast<int8_t>(segments[4]),
+                                         static_cast<int8_t>(segments[5]),
+                                         segments[6]};
   if (default_time_zone_char_len == 0) {
     // invoke from `string_to_timestamp_without_time_zone`
     if (just_time || !allow_time_zone && tz.has_value()) {
       return thrust::make_tuple(error_us, false);
+    } else {
+      return to_utc_timestamp(components);
     }
   } else {
     // invoke from `string_to_timestamp`
@@ -348,19 +367,10 @@ __device__ __host__ thrust::tuple<cudf::timestamp_us, bool> parse_string_to_time
 
       // do not support currently
       return thrust::make_tuple(error_us, false);
+    } else {
+      return to_utc_timestamp(components, time_zone);
     }
   }
-
-  // set components
-  auto components = timestamp_components{segments[0],
-                                         static_cast<int8_t>(segments[1]),
-                                         static_cast<int8_t>(segments[2]),
-                                         static_cast<int8_t>(segments[3]),
-                                         static_cast<int8_t>(segments[4]),
-                                         static_cast<int8_t>(segments[5]),
-                                         segments[6]};
-
-  return to_utc_timestamp(components, time_zone);
 }
 
 struct parse_timestamp_string_fn {
@@ -441,7 +451,7 @@ std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>> to_times
 }
 
 /**
- * Set the null mask of timestamp column according to the valid column.
+ * Set the null mask of timestamp column according to the validity column.
  */
 void update_bitmask(cudf::column& timestamp_column,
                     cudf::column const& validity_column,

From 3335201d0aaf01ed02463b0e1ec74b4c9097b2c1 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 14 Dec 2023 18:29:34 +0800
Subject: [PATCH 08/35] Add JNI interface

---
 src/main/cpp/src/CastStringJni.cpp            | 48 ++++++++++
 .../nvidia/spark/rapids/jni/CastStrings.java  | 89 ++++++++++++++++++-
 .../spark/rapids/jni/CastStringsTest.java     | 71 +++++++++++++++
 3 files changed, 207 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index b7d898a0c8..32d7b7d697 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -30,6 +30,7 @@
 #include <cudf/unary.hpp>
 
 #include "cudf_jni_apis.hpp"
+#include "datetime_parser.hpp"
 #include "dtype_utils.hpp"
 #include "jni_utils.hpp"
 
@@ -255,4 +256,51 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger
   }
   CATCH_CAST_EXCEPTION(env, 0);
 }
+
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
+                                                         jclass,
+                                                         jlong input_column,
+                                                         jstring default_time_zone,
+                                                         jboolean allow_special_expressions,
+                                                         jboolean ansiEnabled)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::jni::native_jstring default_zone(env, default_time_zone);
+    auto input_view{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp(
+      input_view, default_zone.get(), allow_special_expressions, ansiEnabled);
+    if (success) { return cudf::jni::release_as_jlong(ret_cv); }
+  }
+  CATCH_STD(env, 0);
+
+  // success is false, throw exception.
+  // Note: do not need to release ret_cv, because it's nullptr when success is false.
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
+}
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(
+  JNIEnv* env,
+  jclass,
+  jlong input_column,
+  jboolean allowTimeZone,
+  jboolean allow_special_expressions,
+  jboolean ansiEnabled)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto input_view{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone(
+      input_view, allowTimeZone, allow_special_expressions, ansiEnabled);
+    if (success) { return cudf::jni::release_as_jlong(ret_cv); }
+  }
+  CATCH_STD(env, 0);
+
+  // success is false, throw exception.
+  // Note: do not need to release ret_cv, because it's nullptr when success is false.
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
+}
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 2b2267f034..515f725e02 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -152,6 +152,89 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
     return new ColumnVector(fromIntegersWithBase(cv.getNativeView(), base));
   }
 
+  /**
+   * Trims and parses a timestamp string column with time zone suffix to a
+   * timestamp column.
+   * Use the default time zone if string does not contain time zone.
+   *
+   * Supports the following formats:
+   * `[+-]yyyy*`
+   * `[+-]yyyy*-[m]m`
+   * `[+-]yyyy*-[m]m-[d]d`
+   * `[+-]yyyy*-[m]m-[d]d `
+   * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * 
+   * Supports the following time zones:
+   * - Z - Zulu time zone UTC+0
+   * - +|-[h]h:[m]m
+   * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   *
+   * Example:
+   * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]
+   * ts = toTimestamp(input, "UTC", allowSpecialExpressions = true, ansiEnabled =
+   * false)
+   * ts is: ['2023-01-01 00:00:00', '2023-01-01T00:00:00']
+   * 
+   * @param cv                      The input string column to be converted.
+   * @param defaultTimeZone         Use the default time zone if string does not
+   *                                contain time zone.
+   * @param allowSpecialExpressions Whether to allow: epoch, now, today, yesterday, tomorrow
+   * @param ansiEnabled             is Ansi mode
+   * @return a timestamp column
+   * @throws IllegalArgumentException if cv contains invalid value when
+   *                                  ansiEnabled is true
+   */
+  public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone,
+      boolean allowSpecialExpressions, boolean ansiEnabled) {
+    if (defaultTimeZone == null || defaultTimeZone.isEmpty()) {
+      throw new IllegalArgumentException("Default time zone can not be empty.");
+    }
+    return new ColumnVector(toTimestamp(cv.getNativeView(), defaultTimeZone,
+        allowSpecialExpressions, ansiEnabled));
+  }
+
+  /**
+   * Trims and parses a timestamp string column with time zone suffix to a
+   * timestamp column.
+   * Do not use the time zones in timestamp strings.
+   *
+   * Supports the following formats:
+   * `[+-]yyyy*`
+   * `[+-]yyyy*-[m]m`
+   * `[+-]yyyy*-[m]m-[d]d`
+   * `[+-]yyyy*-[m]m-[d]d `
+   * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * 
+   * Supports the following time zones:
+   * - Z - Zulu time zone UTC+0
+   * - +|-[h]h:[m]m
+   * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   *
+   * Example:
+   * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]
+   * ts = toTimestampWithoutTimeZone(input, allowTimeZone = true,
+   * allowSpecialExpressions = true, ansiEnabled = false)
+   * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00']
+   * 
+   * @param cv                      The input string column to be converted.
+   * @param allowTimeZone           Whether to allow a time zone in the timestamp
+   *                                string, e.g.
+   *                                1991-04-14T02:00:00Asia/Shanghai is invalid
+   *                                when time zones are not allowed.
+   * @param allowSpecialExpressions Whether to allow: epoch, now, today, yesterday, tomorrow
+   * @param ansiEnabled             is Ansi mode
+   * @return a timestamp column
+   * @throws IllegalArgumentException if cv contains invalid value when
+   *                                  ansiEnabled is true
+   */
+  public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone,
+      boolean allowSpecialExpressions, boolean ansiEnabled) {
+    return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone,
+        allowSpecialExpressions, ansiEnabled));
+  }
+
   private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip,
       int dtype);
   private static native long toDecimal(long nativeColumnView, boolean ansi_enabled, boolean strip,
@@ -163,4 +246,8 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);
-}
\ No newline at end of file
+  private static native long toTimestamp(long nativeColumnView, String defaultTimeZone,
+      boolean allowSpecialExpressions, boolean ansiEnabled);
+  private static native long toTimestampWithoutTimeZone(long nativeColumnView,
+      boolean allowTimeZone, boolean allowSpecialExpressions, boolean ansiEnabled);
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index c39766454a..7eeee46945 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -17,6 +17,7 @@
 package com.nvidia.spark.rapids.jni;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.fail;
 
 import java.util.ArrayList;
@@ -324,4 +325,74 @@ void baseHex2DecTest() {
       convTestInternal(input, expected, 16);
     }
   }
+
+  // TODO update after this PR is done.
+  @Test
+  void toTimestampTestNonAnsi() {
+    long d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L;
+    long d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L;
+    long d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L;
+    long t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L;
+    long d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55;
+
+    try (
+        ColumnVector input = ColumnVector.fromStrings(
+            null,
+            " 2023 ",
+            " 2023-11 ",
+            " 2023-11-5 ",
+            " 2023-11-05 3:04:55   ",
+            " 2023-11-05T03:4:55   ",
+            " 2023-11-05T3:4:55   ",
+            "  2023-11-5T3:4:55.",
+            "  2023-11-5T3:4:55.Iran",
+            "  2023-11-5T3:4:55.1 ",
+            "  2023-11-5T3:4:55.1Iran",
+            "  2023-11-05T03:04:55.123456  ",
+            "  2023-11-05T03:04:55.123456Iran  ",
+            " 222222 ",
+            " ", // invalid
+            "", // invalid
+            "1-" // invalid
+        );
+        ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(
+            null,
+            d_2023_1_1,
+            d_2023_11_1,
+            d_2023_11_5,
+            d_2023_11_5_t_3_4_55,
+            d_2023_11_5_t_3_4_55,
+            d_2023_11_5_t_3_4_55,
+            d_2023_11_5_t_3_4_55,
+            d_2023_11_5_t_3_4_55,
+            d_2023_11_5_t_3_4_55 + 100000,
+            d_2023_11_5_t_3_4_55 + 100000,
+            d_2023_11_5_t_3_4_55 + 123456,
+            d_2023_11_5_t_3_4_55 + 123456,
+            (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
+            null,
+            null,
+            null);
+        ColumnVector actual = CastStrings.toTimestamp(input,
+            "Asia/Shanghai", false, false)) {
+      AssertUtils.assertColumnsAreEqual(expected, actual);
+    }
+  }
+
+  @Test
+  void toTimestampTestAnsi() {
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) {
+        // ansiEnabled is true
+        CastStrings.toTimestamp(input, "Asia/Shanghai", false, true);
+      }
+    });
+
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) {
+        // ansiEnabled is true
+        CastStrings.toTimestampWithoutTimeZone(input, true, false, true);
+      }
+    });
+  }
 }

From 2270d8b10ed376f7a6af93d2fcf8c269d6f9a298 Mon Sep 17 00:00:00 2001
From: sperlingxx <lovedreamf@gmail.com>
Date: Wed, 27 Dec 2023 18:32:18 +0800
Subject: [PATCH 09/35] complete the work

Signed-off-by: sperlingxx <lovedreamf@gmail.com>
---
 src/main/cpp/src/CastStringJni.cpp            |  44 +-
 src/main/cpp/src/datetime_parser.cu           | 839 ++++++++++--------
 src/main/cpp/src/datetime_parser.hpp          |  14 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |  52 +-
 .../spark/rapids/jni/GpuTimeZoneDB.java       |  94 +-
 .../spark/rapids/jni/CastStringsTest.java     | 182 ++--
 6 files changed, 728 insertions(+), 497 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 32d7b7d697..60bf69ff2e 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -257,21 +257,30 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
-JNIEXPORT jlong JNICALL
-Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
-                                                         jclass,
-                                                         jlong input_column,
-                                                         jstring default_time_zone,
-                                                         jboolean allow_special_expressions,
-                                                         jboolean ansiEnabled)
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv *env,
+                                                                                 jclass,
+                                                                                 jlong input_column,
+                                                                                 jlong transitions_handle,
+                                                                                 jlong tz_indices_col,
+                                                                                 jlong special_dt_lit_col,
+                                                                                 jint tz_default_index,
+                                                                                 jboolean ansi_enabled)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::jni::native_jstring default_zone(env, default_time_zone);
-    auto input_view{*reinterpret_cast<cudf::column_view const*>(input_column)};
-    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp(
-      input_view, default_zone.get(), allow_special_expressions, ansiEnabled);
+
+    auto const &input_view = cudf::strings_column_view(*reinterpret_cast<cudf::column_view const *>(input_column));
+    auto const transitions = reinterpret_cast<cudf::table_view const *>(transitions_handle)->column(0);
+    auto const &tz_indices_view = cudf::strings_column_view(
+        *reinterpret_cast<cudf::column_view const *>(tz_indices_col));
+    auto const &special_dt_lit_view = cudf::strings_column_view(
+        *reinterpret_cast<cudf::column_view const *>(special_dt_lit_col));
+
+    auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
+
+    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_with_tz(
+      input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled);
     if (success) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
@@ -285,16 +294,19 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp
   JNIEnv* env,
   jclass,
   jlong input_column,
-  jboolean allowTimeZone,
-  jboolean allow_special_expressions,
-  jboolean ansiEnabled)
+  jlong special_dt_lit_col,
+  jboolean allow_time_zone,
+  jboolean ansi_enabled)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto input_view{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    auto const &input_view = cudf::strings_column_view(*reinterpret_cast<cudf::column_view const *>(input_column));
+    auto const &special_dt_lit_view = cudf::strings_column_view(
+        *reinterpret_cast<cudf::column_view const *>(special_dt_lit_col));
+
     auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone(
-      input_view, allowTimeZone, allow_special_expressions, ansiEnabled);
+        input_view, special_dt_lit_view, allow_time_zone, ansi_enabled);
     if (success) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 7b1be25a43..31b5fe0099 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -14,11 +14,12 @@
  * limitations under the License.
  */
 
-#include <map>
-#include <numeric>
-#include <vector>
+#include "datetime_parser.hpp"
 
 #include <iostream>
+#include <vector>
+
+#include <cuda/std/cassert>
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -26,21 +27,36 @@
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/valid_if.cuh>
+
+#include <cudf/lists/list_device_view.cuh>
+#include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/search.hpp>
 #include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
+
+#include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/wrappers/timestamps.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/logical.h>
 #include <thrust/optional.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-#include "datetime_parser.hpp"
+using column                   = cudf::column;
+using column_device_view       = cudf::column_device_view;
+using column_view              = cudf::column_view;
+using lists_column_device_view = cudf::detail::lists_column_device_view;
+using size_type                = cudf::size_type;
+using string_view              = cudf::string_view;
+using struct_view              = cudf::struct_view;
+using table_view               = cudf::table_view;
 
 namespace {
 
@@ -57,29 +73,6 @@ struct timestamp_components {
   int32_t microseconds;
 };
 
-/**
- * Convert a local time in a time zone to a UTC timestamp
- */
-__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> to_utc_timestamp(
-  timestamp_components const& components, cudf::string_view const& time_zone)
-{
-  // TODO replace the following fake implementation
-  long seconds = components.year * 365L * 86400L + components.month * 30L * 86400L +
-                 components.day * 86400L + components.hour * 3600L + components.minute * 60L +
-                 components.second;
-  long us = seconds * 1000000L + components.microseconds;
-  return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{us}}, true);
-}
-
-/**
- * Convert a local time in a time zone to a UTC timestamp
- */
-__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> to_utc_timestamp(
-  timestamp_components const& components)
-{
-  return to_utc_timestamp(components, cudf::string_view("UTC", 3));
-}
-
 /**
  * Is white space
  */
@@ -100,11 +93,10 @@ __device__ __host__ inline bool is_whitespace(const char chr)
  *   "epoch", "now", "today", "yesterday", "tomorrow"
  * the expect string should be lower-case a-z chars
  */
-__device__ __host__ inline bool equals_ascii_ignore_case(char const* actual_begin,
-                                                         char const* actual_end,
-                                                         char const* expect_begin,
-                                                         char const* expect_end)
-{
+__device__ inline bool equals_ascii_ignore_case(char const *actual_begin,
+                                                char const *actual_end,
+                                                char const *expect_begin,
+                                                char const *expect_end) {
   if (actual_end - actual_begin != expect_end - expect_begin) { return false; }
 
   while (expect_begin < expect_end) {
@@ -125,281 +117,380 @@ __device__ __host__ bool is_valid_digits(int segment, int digits)
   const int constexpr maxDigitsYear = 6;
   // For the nanosecond part, more than 6 digits is allowed, but will be truncated.
   return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
-         // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID
-         (segment == 7 && digits <= 2) ||
-         (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
+     // For the zoneId segment(7), it could have zero digits when it's a region-based zone ID
+     (segment == 7 && digits <= 2) ||
+     (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
-/**
- * Ported from Spark:
- *   https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
- *   org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
- *
- * Parse a string with time zone to a timestamp.
- * The bool in the returned tuple is false if the parse failed.
- */
-__device__ __host__ thrust::tuple<cudf::timestamp_us, bool> parse_string_to_timestamp_us(
-  cudf::string_view const& timestamp_str,
-  const char* default_time_zone,
-  cudf::size_type default_time_zone_char_len,
-  bool allow_time_zone,
-  bool allow_special_expressions,
-  cudf::timestamp_us epoch,
-  cudf::timestamp_us now,
-  cudf::timestamp_us today,
-  cudf::timestamp_us tomorrow,
-  cudf::timestamp_us yesterday)
-{
-  auto error_us = cudf::timestamp_us{cudf::duration_us{0}};
+enum ParseResult {
+  OK = 0,
+  INVALID = 1,
+  UNSUPPORTED = 2
+};
 
-  if (timestamp_str.empty()) { return thrust::make_tuple(error_us, false); }
+template <bool with_timezone>
+struct parse_timestamp_string_fn {
+  column_device_view const d_strings;
+  column_device_view const special_datetime_names;
+  size_type default_tz_index;
+  bool allow_tz_in_date_str = true;
+  // The list column of transitions to figure out the correct offset
+  // to adjust the timestamp. The type of the values in this column is
+  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32, looseTzInstant: int64>>.
+  thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
+  thrust::optional<column_device_view const> tz_indices = thrust::nullopt;
+
+  __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
+  {
+    if (!d_strings.is_valid(idx)) {
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+    }
 
-  const char* curr_ptr = timestamp_str.data();
-  const char* end_ptr  = curr_ptr + timestamp_str.size_bytes();
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
 
-  // trim left
-  while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
-    ++curr_ptr;
-  }
-  // trim right
-  while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) {
-    --end_ptr;
-  }
+    timestamp_components ts_comp{};
+    char const * tz_lit_ptr = nullptr;
+    size_type tz_lit_len = 0;
+    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) {
+      case ParseResult::INVALID:
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+      case ParseResult::UNSUPPORTED:
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED);
+      case ParseResult::OK:
+      default:
+        break;
+    }
 
-  // special strings: epoch, now, today, yesterday, tomorrow
-  if (allow_special_expressions) {
-    char const* begin_epoch = "epoch";
-    char const* end_epoch   = begin_epoch + 5;
-
-    char const* begin_now = "now";
-    char const* end_now   = begin_now + 3;
-
-    char const* begin_today = "today";
-    char const* end_today   = begin_today + 5;
-
-    char const* begin_tomorrow = "tomorrow";
-    char const* end_tomorrow   = begin_tomorrow + 8;
-
-    char const* begin_yesterday = "yesterday";
-    char const* end_yesterday   = begin_yesterday + 9;
-
-    if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_epoch, end_epoch)) {
-      // epoch
-      return thrust::make_tuple(epoch, true);
-    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_now, end_now)) {
-      // now
-      return thrust::make_tuple(now, true);
-    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_today, end_today)) {
-      // today
-      return thrust::make_tuple(today, true);
-    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_tomorrow, end_tomorrow)) {
-      // tomorrow
-      return thrust::make_tuple(tomorrow, true);
-    } else if (equals_ascii_ignore_case(curr_ptr, end_ptr, begin_yesterday, end_yesterday)) {
-      // yesterday
-      return thrust::make_tuple(yesterday, true);
+    if constexpr (!with_timezone) {
+      // path without timezone, in which unix_timestamp is straightforwardly computed
+      auto const ts_unaligned = compute_epoch_us(ts_comp);
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK);
+    }
+  
+    // path with timezone, in which timezone offset has to be determined before computing unix_timestamp
+    int64_t tz_offset;
+    if (tz_lit_ptr == nullptr) {
+      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
+    } else {
+      auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
+      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) {
+        tz_offset = utc_offset;
+      } else if (ret_code == 1) {
+        auto tz_index = query_index_from_tz_db(tz_view);
+        if (tz_index > transitions->size()) {
+          if (tz_index == tz_indices->size())
+            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+          return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED);
+        }
+        tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
+      } else {
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+      }
     }
+
+    auto const ts_unaligned = compute_epoch_us(ts_comp);
+
+    return thrust::make_tuple(
+        cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}},
+        ParseResult::OK);
   }
 
-  if (curr_ptr == end_ptr) { return thrust::make_tuple(error_us, false); }
-
-  const char* const bytes   = curr_ptr;
-  const size_t bytes_length = end_ptr - curr_ptr;
-
-  thrust::optional<cudf::string_view> tz;
-  int segments[]             = {1, 1, 1, 0, 0, 0, 0, 0, 0};
-  int segments_len           = 9;
-  int i                      = 0;
-  int current_segment_value  = 0;
-  int current_segment_digits = 0;
-  size_t j                   = 0;
-  int digits_milli           = 0;
-  bool just_time             = false;
-  thrust::optional<int> year_sign;
-  if ('-' == bytes[j] || '+' == bytes[j]) {
-    if ('-' == bytes[j]) {
-      year_sign = -1;
+  // TODO: support CST/PST/AST
+  __device__ inline thrust::pair<int64_t, uint8_t> parse_utc_like_tz(string_view const &tz_lit) const
+  {
+    size_type len = tz_lit.size_bytes();
+
+    char const *ptr = tz_lit.data();
+
+    if (*ptr == 'Z') {
+      if (len > 1) return {0, 1};
+      return {0, 0};
+    }
+
+    size_t char_offset = 0;
+
+    if (len > 2
+        && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T')
+        || (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) {
+      char_offset = 3;
+    }
+
+    if (len == char_offset) return {0, 0};
+
+    char const sign_char = *(ptr + char_offset++);
+    int64_t sign;
+    if (sign_char == '+') {
+      sign = 1L;
+    } else if (sign_char == '-') {
+      sign = -1L;
     } else {
-      year_sign = 1;
+      return {0, char_offset < 3 ? 1 : 2};
+    }
+
+    int64_t hms[3] = {0L, 0L, 0L};
+    bool has_colon = false;
+    bool one_digit_mm = false;
+    for (size_type i = 0; i < 3; i++) {
+      if (i == 2 && one_digit_mm) return {0, 2};
+
+      hms[i] = *(ptr + char_offset++) - '0';
+      if (hms[i] < 0 || hms[i] > 9) return {0, 2};
+
+      if (len == char_offset) {
+        if (i > 0) {
+          if (!has_colon) return {0, 2};
+          one_digit_mm = true;
+        }
+        break;
+      }
+
+      if (*(ptr + char_offset) == ':') {
+        if (len == ++char_offset) break;
+        has_colon = true;
+        continue;
+      }
+
+      auto digit = *(ptr + char_offset++) - '0';
+      if (digit < 0 || digit > 9) return {0, 2};
+      hms[i] = hms[i] * 10 + digit;
+
+      if (len == char_offset) break;
+      if (*(ptr + char_offset) == ':') {
+        if (len == ++char_offset) break;
+        has_colon = true;
+        continue;
+      }
+      if (has_colon) return {0, 2};
     }
-    j += 1;
+
+    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2};
+    if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2};
+
+    return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0};
   }
 
-  while (j < bytes_length) {
-    char b           = bytes[j];
-    int parsed_value = static_cast<int32_t>(b - '0');
-    if (parsed_value < 0 || parsed_value > 9) {
-      if (0 == j && 'T' == b) {
-        just_time = true;
-        i += 3;
-      } else if (i < 2) {
-        if (b == '-') {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
-          }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
-        } else if (0 == i && ':' == b && !year_sign.has_value()) {
-          just_time = true;
-          if (!is_valid_digits(3, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
+  __device__ inline int query_index_from_tz_db(string_view const &tz_lit) const
+  {
+    // TODO: replace with more efficient approach (such as binary search or prefix tree)
+    auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) {
+      return tz->element<string_view>(i) == tz_lit;
+    };
+    auto ret = thrust::find_if(thrust::seq,
+                               thrust::make_counting_iterator(0),
+                               thrust::make_counting_iterator(tz_indices->size()),
+                               predicate);
+
+    return *ret;
+  }
+
+  __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, size_type tz_index) const
+  {
+    auto const &utc_offsets = transitions->child().child(2);
+    auto const &loose_instants = transitions->child().child(3);
+
+    auto const local_transitions = cudf::list_device_view{*transitions, tz_index};
+    auto const list_size = local_transitions.size();
+
+    auto const transition_times = cudf::device_span<int64_t const>(
+        loose_instants.data<int64_t>() + local_transitions.element_offset(0),
+        static_cast<size_t>(list_size));
+
+    auto const it = thrust::upper_bound(
+        thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
+    auto const idx = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
+    auto const list_offset = local_transitions.element_offset(idx - 1);
+
+    return static_cast<int64_t>(utc_offsets.element<int32_t>(list_offset));
+  }
+
+  __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
+  {
+    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + ts.minute * 60L + ts.second;
+  }
+
+  __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
+  {
+    auto const ymd =  // chrono class handles the leap year calculations for us
+        cuda::std::chrono::year_month_day(
+            cuda::std::chrono::year{ts.year},
+            cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
+            cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
+    auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
+
+    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+
+    return timestamp_s * 1000000L + ts.microseconds;
+  }
+
+  /**
+   * Ported from Spark:
+   *   https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
+   *   org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
+   *
+   * Parse a string with time zone to a timestamp.
+   * Returns a ParseResult status code; results are passed back through the pointer arguments.
+   */
+  __device__ inline ParseResult parse_string_to_timestamp_us(
+      timestamp_components *ts_comp,
+      char const **parsed_tz_ptr,
+      size_type *parsed_tz_length,
+      cudf::string_view const &timestamp_str) const {
+
+    if (timestamp_str.empty()) { return ParseResult::INVALID; }
+
+    const char *curr_ptr = timestamp_str.data();
+    const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
+
+    // trim left
+    while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
+      ++curr_ptr;
+    }
+    // trim right
+    while (curr_ptr < end_ptr - 1 && is_whitespace(*(end_ptr - 1))) {
+      --end_ptr;
+    }
+
+    // TODO: support special dates [epoch, now, today, yesterday, tomorrow]
+    for (size_type i = 0; i < special_datetime_names.size(); i++) {
+      auto const& ref = special_datetime_names.element<string_view>(i);
+      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) {
+        *parsed_tz_ptr = ref.data();
+        *parsed_tz_length = ref.size_bytes();
+        return ParseResult::UNSUPPORTED;
+      }
+    }
+
+    if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
+
+    const char *const bytes = curr_ptr;
+    const size_type bytes_length = end_ptr - curr_ptr;
+
+    int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0};
+    int segments_len = 9;
+    int i = 0;
+    int current_segment_value = 0;
+    int current_segment_digits = 0;
+    size_t j = 0;
+    int digits_milli = 0;
+    // bool just_time = false;
+    thrust::optional<int> year_sign;
+    if ('-' == bytes[j] || '+' == bytes[j]) {
+      if ('-' == bytes[j]) {
+        year_sign = -1;
+      } else {
+        year_sign = 1;
+      }
+      j += 1;
+    }
+
+    while (j < bytes_length) {
+      char b = bytes[j];
+      int parsed_value = static_cast<int32_t>(b - '0');
+      if (parsed_value < 0 || parsed_value > 9) {
+        if (0 == j && 'T' == b) {
+          // just_time = true;
+          i += 3;
+        } else if (i < 2) {
+          if (b == '-') {
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+          } else if (0 == i && ':' == b && !year_sign.has_value()) {
+            // just_time = true;
+            if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[3] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i = 4;
+          } else {
+            return ParseResult::INVALID;
           }
-          segments[3]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i                      = 4;
-        } else {
-          return thrust::make_tuple(error_us, false);
-        }
-      } else if (2 == i) {
-        if (' ' == b || 'T' == b) {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
+        } else if (2 == i) {
+          if (' ' == b || 'T' == b) {
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+          } else {
+            return ParseResult::INVALID;
           }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
-        } else {
-          return thrust::make_tuple(error_us, false);
-        }
-      } else if (3 == i || 4 == i) {
-        if (':' == b) {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
+        } else if (3 == i || 4 == i) {
+          if (':' == b) {
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+          } else {
+            return ParseResult::INVALID;
           }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
-        } else {
-          return thrust::make_tuple(error_us, false);
-        }
-      } else if (5 == i || 6 == i) {
-        if ('.' == b && 5 == i) {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
+        } else if (5 == i || 6 == i) {
+          if ('.' == b && 5 == i) {
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+          } else {
+            if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+            *parsed_tz_ptr = bytes + j;
+            // strip the whitespace between timestamp and timezone
+            while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) ++(*parsed_tz_ptr);
+            *parsed_tz_length = end_ptr - *parsed_tz_ptr;
+            break;
           }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
+          if (i == 6 && '.' != b) { i += 1; }
         } else {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
+          if (i < segments_len && (':' == b || ' ' == b)) {
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i] = current_segment_value;
+            current_segment_value = 0;
+            current_segment_digits = 0;
+            i += 1;
+          } else {
+            return ParseResult::INVALID;
           }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
-          tz = cudf::string_view(bytes + j, (bytes_length - j));
-          j  = bytes_length - 1;
         }
-        if (i == 6 && '.' != b) { i += 1; }
       } else {
-        if (i < segments_len && (':' == b || ' ' == b)) {
-          if (!is_valid_digits(i, current_segment_digits)) {
-            return thrust::make_tuple(error_us, false);
-          }
-          segments[i]            = current_segment_value;
-          current_segment_value  = 0;
-          current_segment_digits = 0;
-          i += 1;
-        } else {
-          return thrust::make_tuple(error_us, false);
+        if (6 == i) { digits_milli += 1; }
+        // We will truncate the nanosecond part if there are more than 6 digits, which results
+        // in loss of precision
+        if (6 != i || current_segment_digits < 6) {
+          current_segment_value = current_segment_value * 10 + parsed_value;
         }
+        current_segment_digits += 1;
       }
-    } else {
-      if (6 == i) { digits_milli += 1; }
-      // We will truncate the nanosecond part if there are more than 6 digits, which results
-      // in loss of precision
-      if (6 != i || current_segment_digits < 6) {
-        current_segment_value = current_segment_value * 10 + parsed_value;
-      }
-      current_segment_digits += 1;
+      j += 1;
     }
-    j += 1;
-  }
 
-  if (!is_valid_digits(i, current_segment_digits)) { return thrust::make_tuple(error_us, false); }
-  segments[i] = current_segment_value;
+    if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+    segments[i] = current_segment_value;
 
-  while (digits_milli < 6) {
-    segments[6] *= 10;
-    digits_milli += 1;
-  }
+    while (digits_milli < 6) {
+      segments[6] *= 10;
+      digits_milli += 1;
+    }
 
-  cudf::string_view time_zone;
-  if (tz.has_value()) {
-    time_zone = tz.value();
-  } else {
-    time_zone = cudf::string_view(default_time_zone, default_time_zone_char_len);
-  }
+    segments[0] *= year_sign.value_or(1);
+    // above is ported from Spark.
 
-  segments[0] *= year_sign.value_or(1);
-  // above is ported from Spark.
-
-  // set components
-  auto components = timestamp_components{segments[0],
-                                         static_cast<int8_t>(segments[1]),
-                                         static_cast<int8_t>(segments[2]),
-                                         static_cast<int8_t>(segments[3]),
-                                         static_cast<int8_t>(segments[4]),
-                                         static_cast<int8_t>(segments[5]),
-                                         segments[6]};
-  if (default_time_zone_char_len == 0) {
-    // invoke from `string_to_timestamp_without_time_zone`
-    if (just_time || !allow_time_zone && tz.has_value()) {
-      return thrust::make_tuple(error_us, false);
-    } else {
-      return to_utc_timestamp(components);
-    }
-  } else {
-    // invoke from `string_to_timestamp`
-    if (just_time) {
-      // Update here to support the following format:
-      //   `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
-      //   `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
-      // by set local date in a time zone: year-month-day.
-      // Above 2 formats are time zone related, Spark uses LocalDate.now(zoneId)
-
-      // do not support currently
-      return thrust::make_tuple(error_us, false);
-    } else {
-      return to_utc_timestamp(components, time_zone);
-    }
-  }
-}
+    // set components
+    ts_comp->year = segments[0];
+    ts_comp->month = static_cast<int8_t>(segments[1]);
+    ts_comp->day = static_cast<int8_t>(segments[2]);
+    ts_comp->hour = static_cast<int8_t>(segments[3]);
+    ts_comp->minute = static_cast<int8_t>(segments[4]);
+    ts_comp->second = static_cast<int8_t>(segments[5]);
+    ts_comp->microseconds = segments[6];
 
-struct parse_timestamp_string_fn {
-  cudf::column_device_view const d_strings;
-  const char* default_time_zone;
-  cudf::size_type default_time_zone_char_len;
-  bool allow_time_zone;
-  bool allow_special_expressions;
-  // TODO the following should be passed in.
-  // Note: today, tomorrow, yesterday are time zone related, should use time zone to generate.
-  cudf::timestamp_us epoch     = cudf::timestamp_us{cudf::duration_us{111L}};
-  cudf::timestamp_us now       = cudf::timestamp_us{cudf::duration_us{222L}};
-  cudf::timestamp_us today     = cudf::timestamp_us{cudf::duration_us{333L}};
-  cudf::timestamp_us tomorrow  = cudf::timestamp_us{cudf::duration_us{444L}};
-  cudf::timestamp_us yesterday = cudf::timestamp_us{cudf::duration_us{555L}};
-
-  __device__ thrust::tuple<cudf::timestamp_us, bool> operator()(const cudf::size_type& idx) const
-  {
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    return parse_string_to_timestamp_us(d_str,
-                                        default_time_zone,
-                                        default_time_zone_char_len,
-                                        allow_time_zone,
-                                        allow_special_expressions,
-                                        epoch,
-                                        now,
-                                        today,
-                                        tomorrow,
-                                        yesterday);
+    return ParseResult::OK;
   }
 };
 
@@ -408,114 +499,90 @@ struct parse_timestamp_string_fn {
  * Trims and parses timestamp string column to a timestamp column and a is valid column
  *
  */
-std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>> to_timestamp(
+std::pair<std::unique_ptr<cudf::column>, bool> to_timestamp(
   cudf::strings_column_view const& input,
-  std::string_view const& default_time_zone,
-  bool allow_time_zone,
-  bool allow_special_expressions,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  cudf::strings_column_view const& special_datetime_lit,
+  bool ansi_mode,
+  bool allow_tz_in_date_str = true,
+  size_type default_tz_index = 1000000000,
+  cudf::column_view const *transitions = nullptr,
+  cudf::strings_column_view const *tz_indices = nullptr)
 {
+  auto const stream = cudf::get_default_stream();
+  auto const mr = rmm::mr::get_current_device_resource();
+
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto d_special_datetime_lit = cudf::column_device_view::create(special_datetime_lit.parent(), stream);
 
-  auto output_timestamp =
-    cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
-                                input.size(),
-                                cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                input.null_count(),
-                                stream,
-                                mr);
-  // record which string is failed to parse.
-  auto output_bool =
-    cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8},
+  auto result_col =
+      cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
                                   input.size(),
-                                  cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                  input.null_count(),
+                                  cudf::mask_state::UNALLOCATED,
                                   stream,
                                   mr);
+  // Per-row parse status; a non-zero value marks a string that failed to parse.
+  auto result_valid_col =
+      cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::UINT8},
+                                    input.size(),
+                                    cudf::mask_state::UNALLOCATED,
+                                    stream,
+                                    mr);
+
+  if (transitions == nullptr || tz_indices == nullptr) {
+    thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(input.size()),
+        thrust::make_zip_iterator(
+            thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
+                               result_valid_col->mutable_view().begin<uint8_t>())),
+        parse_timestamp_string_fn<false>{*d_strings,
+                                         *d_special_datetime_lit,
+                                         default_tz_index,
+                                         allow_tz_in_date_str});
+  } else {
+    auto const ft_cdv_ptr = column_device_view::create(*transitions, stream);
+    auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
+    auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream);
+
+    thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(input.size()),
+        thrust::make_zip_iterator(
+            thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
+                               result_valid_col->mutable_view().begin<uint8_t>())),
+        parse_timestamp_string_fn<true>{*d_strings,
+                                        *d_special_datetime_lit,
+                                        default_tz_index,
+                                        true,
+                                        d_transitions,
+                                        *d_tz_indices});
+  }
 
-  thrust::transform(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(input.size()),
-    thrust::make_zip_iterator(
-      thrust::make_tuple(output_timestamp->mutable_view().begin<cudf::timestamp_us>(),
-                         output_bool->mutable_view().begin<bool>())),
-    parse_timestamp_string_fn{*d_strings,
-                              default_time_zone.data(),
-                              static_cast<cudf::size_type>(default_time_zone.size()),
-                              allow_time_zone,
-                              allow_special_expressions});
-
-  return std::make_pair(std::move(output_timestamp), std::move(output_bool));
-}
+  auto valid_view = result_valid_col->mutable_view();
 
-/**
- * Set the null mask of timestamp column according to the validity column.
- */
-void update_bitmask(cudf::column& timestamp_column,
-                    cudf::column const& validity_column,
-                    rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr)
-{
-  auto const& ts_view    = timestamp_column.view();
-  auto const& valid_view = validity_column.view();
-  std::vector<cudf::bitmask_type const*> masks;
-  std::vector<cudf::size_type> offsets;
-  if (timestamp_column.nullable()) {
-    masks.push_back(ts_view.null_mask());
-    offsets.push_back(ts_view.offset());
+  auto exception_exists = thrust::any_of(
+    rmm::exec_policy(stream),
+    valid_view.begin<uint8_t>(),
+    valid_view.end<uint8_t>(),
+    []__device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; });
+  if (exception_exists) {
+    CUDF_FAIL("There exists unsupported timestamp schema!");
   }
 
-  // generate bitmask from `validity_column`
   auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if(
-    valid_view.begin<bool>(), valid_view.end<bool>(), thrust::identity<bool>{}, stream, mr);
-
-  masks.push_back(static_cast<cudf::bitmask_type*>(valid_bitmask.data()));
-  offsets.push_back(0);
-
-  // merge 2 bitmasks
-  auto [null_mask, null_count] =
-    cudf::detail::bitmask_and(masks, offsets, timestamp_column.size(), stream, mr);
-
-  timestamp_column.set_null_mask(null_mask, null_count);
-}
+      valid_view.begin<uint8_t>(), valid_view.end<uint8_t>(),
+      [] __device__(uint8_t e) { return e == 0; },
+      stream, mr);
 
-/**
- * Parse string column with time zone to timestamp column,
- * Returns a pair of timestamp column and a bool indicates whether successed.
- */
-std::pair<std::unique_ptr<cudf::column>, bool> parse_string_to_timestamp(
-  cudf::strings_column_view const& input,
-  std::string_view const& default_time_zone,
-  bool allow_time_zone,
-  bool allow_special_expressions,
-  bool ansi_mode)
-{
-  auto timestamp_type = cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS};
-  if (input.size() == 0) {
-    return std::make_pair(cudf::make_empty_column(timestamp_type.id()), true);
+  if (ansi_mode && input.null_count() < valid_null_count) {
+    // ANSI mode: more rows are marked invalid than the input had nulls, so a string failed to parse
+    return std::make_pair(nullptr, false);
   }
 
-  auto const stream = cudf::get_default_stream();
-  auto const mr     = rmm::mr::get_current_device_resource();
-  auto [timestamp_column, validity_column] =
-    to_timestamp(input, default_time_zone, allow_time_zone, allow_special_expressions, stream, mr);
-
-  if (ansi_mode) {
-    // create scalar, value is false, is_valid is true
-    cudf::numeric_scalar<bool> false_scalar{false, true, stream, mr};
-    if (cudf::contains(*validity_column, false_scalar, stream)) {
-      // has invalid value in validity column under ansi mode
-      return std::make_pair(nullptr, false);
-    } else {
-      update_bitmask(*timestamp_column, *validity_column, stream, mr);
-      return std::make_pair(std::move(timestamp_column), true);
-    }
-  } else {
-    update_bitmask(*timestamp_column, *validity_column, stream, mr);
-    return std::make_pair(std::move(timestamp_column), true);
-  }
+  result_col->set_null_mask(valid_bitmask, valid_null_count, stream);
+  return std::make_pair(std::move(result_col), true);
 }
 
 }  // namespace
@@ -527,15 +594,18 @@ namespace spark_rapids_jni {
  * Returns a pair of timestamp column and a bool indicates whether successed.
  * If does not have time zone in string, use the default time zone.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
   cudf::strings_column_view const& input,
-  std::string_view const& default_time_zone,
-  bool allow_special_expressions,
+  cudf::column_view const& transitions,
+  cudf::strings_column_view const& tz_indices,
+  cudf::strings_column_view const& special_datetime_lit,
+  cudf::size_type default_tz_index,
   bool ansi_mode)
 {
-  CUDF_EXPECTS(default_time_zone.size() > 0, "should specify default time zone");
-  return parse_string_to_timestamp(
-    input, default_time_zone, true, allow_special_expressions, ansi_mode);
+  if (input.size() == 0) {
+    return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true);
+  }
+  return to_timestamp(input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
 }
 
 /**
@@ -546,15 +616,14 @@ std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
  */
 std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
   cudf::strings_column_view const& input,
+  cudf::strings_column_view const& special_datetime_lit,
   bool allow_time_zone,
-  bool allow_special_expressions,
   bool ansi_mode)
 {
-  return parse_string_to_timestamp(input,
-                                   std::string_view(""),  // specify empty time zone
-                                   allow_time_zone,
-                                   allow_special_expressions,
-                                   ansi_mode);
+  if (input.size() == 0) {
+    return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true);
+  }
+  return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone);
 }
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index d2f1dfa39c..1b72a1fbb8 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -68,11 +68,13 @@ namespace spark_rapids_jni {
  * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not
  * empty otherwise.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
-  cudf::strings_column_view const& input,
-  std::string_view const& default_time_zone,
-  bool allow_special_expressions,
-  bool ansi_mode);
+std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
+    cudf::strings_column_view const& input,
+    cudf::column_view const& transitions,
+    cudf::strings_column_view const& tz_indices,
+    cudf::strings_column_view const& special_datetime_lit,
+    cudf::size_type default_tz_index,
+    bool ansi_mode);
 
 /**
  *
@@ -128,8 +130,8 @@ std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp(
  */
 std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
   cudf::strings_column_view const& input,
+  cudf::strings_column_view const& special_datetime_lit,
   bool allow_time_zone,
-  bool allow_special_expressions,
   bool ansi_mode);
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 515f725e02..b383468e7e 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -18,6 +18,8 @@
 
 import ai.rapids.cudf.*;
 
+import java.time.ZoneId;
+
 /** Utility class for casting between string columns and native type columns */
 public class CastStrings {
   static {
@@ -179,19 +181,30 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    * @param cv                      The input string column to be converted.
    * @param defaultTimeZone         Use the default time zone if string does not
    *                                contain time zone.
-   * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow
    * @param ansiEnabled             is Ansi mode
    * @return a timestamp column
    * @throws IllegalArgumentException if cv contains invalid value when
    *                                  ansiEnabled is true
    */
-  public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone,
-      boolean allowSpecialExpressions, boolean ansiEnabled) {
-    if (defaultTimeZone == null || defaultTimeZone.isEmpty()) {
-      throw new IllegalArgumentException("Default time zone can not be empty.");
+  public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) {
+    if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) {
+      throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
+              defaultTimeZone.toString()));
+    }
+
+    GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance();
+    if (!singleton.isLoaded()) {
+      GpuTimeZoneDB.cacheDatabase();
+    }
+
+    Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString());
+
+    try (Table transitions = singleton.getTransitions();
+         ColumnVector tzIndices = singleton.getZoneIDVector();
+         ColumnVector specialTz = singleton.getSpecialTzVector()) {
+      return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(),
+              tzIndices.getNativeView(), specialTz.getNativeView(), tzIndex, ansiEnabled));
     }
-    return new ColumnVector(toTimestamp(cv.getNativeView(), defaultTimeZone,
-        allowSpecialExpressions, ansiEnabled));
   }
 
   /**
@@ -219,20 +232,25 @@ public static ColumnVector toTimestamp(ColumnView cv, String defaultTimeZone,
    * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00']
    * 
    * @param cv                      The input string column to be converted.
-   * @param allow_time_zone         whether allow time zone in the timestamp
+   * @param allowTimeZone           whether allow time zone in the timestamp
    *                                string. e.g.:
    *                                1991-04-14T02:00:00Asia/Shanghai is invalid
    *                                when do not allow time zone.
-   * @param allowSpecialExpressions Whether allow: epoch, now, today, tomorrow
    * @param ansiEnabled             is Ansi mode
    * @return a timestamp column
    * @throws IllegalArgumentException if cv contains invalid value when
    *                                  ansiEnabled is true
    */
-  public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone,
-      boolean allowSpecialExpressions, boolean ansiEnabled) {
-    return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone,
-        allowSpecialExpressions, ansiEnabled));
+  public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) {
+    GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance();
+    if (!singleton.isLoaded()) {
+      GpuTimeZoneDB.cacheDatabase();
+    }
+
+    try (ColumnVector specialTz = singleton.getSpecialTzVector()) {
+      return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), specialTz.getNativeView(),
+              allowTimeZone,  ansiEnabled));
+    }
   }
 
   private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip,
@@ -246,8 +264,8 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);
-  private static native long toTimestamp(long nativeColumnView, String defaultTimeZone,
-      boolean allowSpecialExpressions, boolean ansiEnabled);
-  private static native long toTimestampWithoutTimeZone(long nativeColumnView,
-      boolean allowTimeZone, boolean allowSpecialExpressions, boolean ansiEnabled);
+  private static native long toTimestamp(long input,
+      long transitions, long tzIndices, long specialDate, int tzIndex, boolean ansiEnabled);
+  private static native long toTimestampWithoutTimeZone(long input,
+      long specialDate, boolean allowTimeZone, boolean ansiEnabled);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index b63a9dc282..f8b49b5b22 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -17,6 +17,7 @@
 package com.nvidia.spark.rapids.jni;
 
 import java.time.Instant;
+import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.zone.ZoneOffsetTransition;
 import java.time.zone.ZoneRules;
@@ -27,15 +28,14 @@
 import java.util.Map;
 import java.util.TimeZone;
 import java.util.concurrent.*;
+import java.util.function.Function;
 
-import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.DType;
-import ai.rapids.cudf.HostColumnVector;
-import ai.rapids.cudf.Table;
+import ai.rapids.cudf.*;
 
 public class GpuTimeZoneDB {
 
   public static final int TIMEOUT_SECS = 300;
+  public static final String[] SPECIAL_TZ_LITERALS = {"epoch", "now", "today", "tomorrow", "yesterday"};
 
 
   // For the timezone database, we store the transitions in a ColumnVector that is a list of 
@@ -43,14 +43,18 @@ public class GpuTimeZoneDB {
   //   LIST<STRUCT<utcInstant: int64, localInstant: int64, offset: int32>>
   private CompletableFuture<Map<String, Integer>> zoneIdToTableFuture;
   private CompletableFuture<HostColumnVector> fixedTransitionsFuture;
+  private CompletableFuture<HostColumnVector> zoneIdVectorFuture;
+  private CompletableFuture<HostColumnVector> specialTzLiteralsFuture;
 
   private boolean closed = false;
 
   GpuTimeZoneDB() {
     zoneIdToTableFuture = new CompletableFuture<>();
     fixedTransitionsFuture = new CompletableFuture<>();
+    zoneIdVectorFuture = new CompletableFuture<>();
+    specialTzLiteralsFuture = new CompletableFuture<>();
   }
-  
+
   private static GpuTimeZoneDB instance = new GpuTimeZoneDB();
   // This method is default visibility for testing purposes only. The instance will be never be exposed publicly
   // for this class.
@@ -157,10 +161,11 @@ public static ZoneId getZoneId(String timeZoneId) {
     return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS);
   }
 
-  private boolean isLoaded() {
-    return zoneIdToTableFuture.isDone();
+  public boolean isLoaded() {
+    return zoneIdToTableFuture.isDone() && fixedTransitionsFuture.isDone() &&
+            zoneIdVectorFuture.isDone() && specialTzLiteralsFuture.isDone();
   }
-  
+
   private void loadData(Executor executor) throws IllegalStateException {
     // Start loading the data in separate thread and return
     try {
@@ -176,6 +181,9 @@ private void doLoadData() {
       try {
         Map<String, Integer> zoneIdToTable = new HashMap<>();
         List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
+        List<String> zondIdList = new ArrayList<>();
+        List<String> unsupportedZoneList = new ArrayList<>();
+
         for (String tzId : TimeZone.getAvailableIDs()) {
           ZoneId zoneId;
           try {
@@ -189,6 +197,7 @@ private void doLoadData() {
           ZoneRules zoneRules = zoneId.getRules();
           // Filter by non-repeating rules
           if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) {
+            unsupportedZoneList.add(zoneId.getId());
             continue;
           }
           if (!zoneIdToTable.containsKey(zoneId.getId())) {
@@ -198,16 +207,27 @@ private void doLoadData() {
             if (zoneRules.isFixedOffset()) {
               data.add(
                   new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
-                      zoneRules.getOffset(Instant.now()).getTotalSeconds())
+                      zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE)
               );
             } else {
               // Capture the first official offset (before any transition) using Long min
               ZoneOffsetTransition first = transitions.get(0);
               data.add(
                   new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
-                      first.getOffsetBefore().getTotalSeconds())
+                      first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE)
               );
               transitions.forEach(t -> {
+                // Maps a LocalDateTime to a "loose epoch second": a value that is not the real
+                // epoch second but preserves the ordering of local date-times (each year is given
+                // 400 days and each month 31 days). These values are cached alongside the exact
+                // epoch seconds so that the transition rule to apply can be looked up directly
+                // from LocalDateTime fields. The lookup is the same binary search used with exact
+                // epoch seconds (convert_timestamp_tz_functor), keyed on loose epoch seconds instead.
+                Function<LocalDateTime, Long> localToLooseEpochSecond = lt ->
+                        86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L +
+                                lt.getDayOfMonth() - 1) +
+                                3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond();
+
                 // Whether transition is an overlap vs gap.
                 // In Spark:
                 // if it's a gap, then we use the offset after *on* the instant
@@ -219,35 +239,53 @@ private void doLoadData() {
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds())
+                          t.getOffsetAfter().getTotalSeconds(),
+                          localToLooseEpochSecond.apply(t.getDateTimeAfter())
+                      )
                   );
                 } else {
                   data.add(
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds())
+                          t.getOffsetAfter().getTotalSeconds(),
+                          localToLooseEpochSecond.apply(t.getDateTimeBefore())
+                      )
                   );
                 }
               });
             }
             masterTransitions.add(data);
             zoneIdToTable.put(zoneId.getId(), idx);
+            zondIdList.add(zoneId.getId());
           }
         }
+        zoneIdToTableFuture.complete(zoneIdToTable);
+
         HostColumnVector.DataType childType = new HostColumnVector.StructType(false,
             new HostColumnVector.BasicType(false, DType.INT64),
             new HostColumnVector.BasicType(false, DType.INT64),
-            new HostColumnVector.BasicType(false, DType.INT32));
+            new HostColumnVector.BasicType(false, DType.INT32),
+            new HostColumnVector.BasicType(false, DType.INT64));
         HostColumnVector.DataType resultType =
             new HostColumnVector.ListType(false, childType);
-        HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType,
-            masterTransitions.toArray(new List[0]));
-        fixedTransitionsFuture.complete(fixedTransitions);
-        zoneIdToTableFuture.complete(zoneIdToTable);
+
+        zondIdList.addAll(unsupportedZoneList);
+
+        try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) {
+          try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) {
+            try (HostColumnVector specialTzVector = HostColumnVector.fromStrings(SPECIAL_TZ_LITERALS)) {
+              fixedTransitionsFuture.complete(fixedTransitions.incRefCount());
+              zoneIdVectorFuture.complete(zoneIdVector.incRefCount());
+              specialTzLiteralsFuture.complete(specialTzVector.incRefCount());
+            }
+          }
+        }
       } catch (Exception e) {
         fixedTransitionsFuture.completeExceptionally(e);
         zoneIdToTableFuture.completeExceptionally(e);
+        zoneIdVectorFuture.completeExceptionally(e);
+        specialTzLiteralsFuture.completeExceptionally(e);
         throw e;
       }
     }
@@ -273,7 +311,7 @@ private HostColumnVector getHostFixedTransitions() {
     }
   }
 
-  private Map<String, Integer> getZoneIDMap() {
+  public Map<String, Integer> getZoneIDMap() {
     try {
       return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS);
     } catch (InterruptedException | ExecutionException | TimeoutException e) {
@@ -281,7 +319,25 @@ private Map<String, Integer> getZoneIDMap() {
     }
   }
 
-  private Table getTransitions() {
+  public ColumnVector getZoneIDVector() {
+    try {
+      HostColumnVector hcv = zoneIdVectorFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS);
+      return hcv.copyToDevice();
+    } catch (InterruptedException | ExecutionException | TimeoutException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public ColumnVector getSpecialTzVector() {
+    try {
+      HostColumnVector hcv = specialTzLiteralsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS);
+      return hcv.copyToDevice();
+    } catch (InterruptedException | ExecutionException | TimeoutException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public Table getTransitions() {
     try (ColumnVector fixedTransitions = getFixedTransitions()) {
       return new Table(fixedTransitions);
     }
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 7eeee46945..a8939bc825 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -20,8 +20,11 @@
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.fail;
 
+import java.time.*;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.AbstractMap;
+import java.util.Map;
 
 import org.junit.jupiter.api.Test;
 
@@ -326,72 +329,143 @@ void baseHex2DecTest() {
     }
   }
 
-  // TODO update after this PR is done.
   @Test
-  void toTimestampTestNonAnsi() {
-    long d_2023_1_1 = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L;
-    long d_2023_11_1 = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L;
-    long d_2023_11_5 = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L;
-    long t_3_4_55 = (3L * 3600L + 4L * 60L + 55L) * 1000000L;
-    long d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55;
+  void toTimestampTestAnsiWithoutTz() {
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) {
+        // ansiEnabled is true
+        CastStrings.toTimestampWithoutTimeZone(input, false, true);
+      }
+    });
+
+    Instant instant = LocalDateTime.parse("2023-11-05T03:04:55").toInstant(ZoneOffset.UTC);
+    long expectedResults = instant.getEpochSecond() * 1000000L;
 
     try (
-        ColumnVector input = ColumnVector.fromStrings(
-            null,
-            " 2023 ",
-            " 2023-11 ",
-            " 2023-11-5 ",
-            " 2023-11-05 3:04:55   ",
-            " 2023-11-05T03:4:55   ",
-            " 2023-11-05T3:4:55   ",
-            "  2023-11-5T3:4:55.",
-            "  2023-11-5T3:4:55.Iran",
-            "  2023-11-5T3:4:55.1 ",
-            "  2023-11-5T3:4:55.1Iran",
-            "  2023-11-05T03:04:55.123456  ",
-            "  2023-11-05T03:04:55.123456Iran  ",
-            " 222222 ",
-            " ", // invalid
-            "", // invalid
-            "1-" // invalid
-        );
-        ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(
-            null,
-            d_2023_1_1,
-            d_2023_11_1,
-            d_2023_11_5,
-            d_2023_11_5_t_3_4_55,
-            d_2023_11_5_t_3_4_55,
-            d_2023_11_5_t_3_4_55,
-            d_2023_11_5_t_3_4_55,
-            d_2023_11_5_t_3_4_55,
-            d_2023_11_5_t_3_4_55 + 100000,
-            d_2023_11_5_t_3_4_55 + 100000,
-            d_2023_11_5_t_3_4_55 + 123456,
-            d_2023_11_5_t_3_4_55 + 123456,
-            (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
-            null,
-            null,
-            null);
-        ColumnVector actual = CastStrings.toTimestamp(input,
-            "Asia/Shanghai", false, false)) {
+        ColumnVector input = ColumnVector.fromStrings("2023-11-05 3:04:55");
+        ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expectedResults);
+        ColumnVector actual = CastStrings.toTimestampWithoutTimeZone(input, false, true)) {
       AssertUtils.assertColumnsAreEqual(expected, actual);
     }
   }
 
   @Test
-  void toTimestampTestAnsi() {
-    assertThrows(IllegalArgumentException.class, () -> {
-      try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) {
-        // ansiEnabled is true
-        CastStrings.toTimestamp(input, "Asia/Shanghai", false, true);
+  void toTimestampTestWithTz() {
+    List<Map.Entry<String, Long>> entries = new ArrayList<>();
+    // Without timezone
+    entries.add(new AbstractMap.SimpleEntry<>("  2000-01-29 ", 949104000000000L));
+    // Timezone IDs
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-05 3:4:55 America/Sao_Paulo", 1699164295000000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1   Asia/Shanghai", 1699124695100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-1-29 13:59:8 Iran", 949141748000000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1968-03-25T23:59:1.123Asia/Tokyo", -55846858877000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1968-03-25T23:59:1.123456Asia/Tokyo", -55846858876544L));
+  
+    // UTC-like timezones
+    //  no adjustment
+    entries.add(new AbstractMap.SimpleEntry<>("1970-9-9 2:33:44 Z", 21695624000000L));
+    entries.add(new AbstractMap.SimpleEntry<>(" 1969-12-1 2:3:4.999Z", -2671015001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1954-10-20 00:11:22 GMT  ", -479692118000000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1984-1-3 00:11:22UTC", 441936682000000L));
+    //  hh
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 UTC+18 ", 910231201120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC+0", 910296001120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC-00", 910296001120000L));
+    entries.add(new AbstractMap.SimpleEntry<>(" 1998-11-05T20:00:1.12   GMT+09 ", 910263601120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12   GMT-1", 910299601120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12  UTC-6", 910317601120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12  UTC-18", 910360801120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12UTC-00", 910296001120000L));
+    entries.add(new AbstractMap.SimpleEntry<>(" 1998-11-05T20:00:1.12   +09 ", 910263601120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12   -1", 910299601120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12 +18 ", 910231201120000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1998-11-05T20:00:1.12-00", 910296001120000L));
+    //  hh:mm
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC+1428", -2723095001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 GMT-1501", -2616955001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 GMT+1:22", -2675935001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.8888 GMT+8:2", -2699935111200L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC+17:9", -2732755001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 UTC-09:11", -2637955001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 +1428  ", -2723095001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999-1501  ", -2616955001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999 +1:22 ", -2675935001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.8888 +8:2  ", -2699935111200L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999+17:9", -2732755001000L));
+    entries.add(new AbstractMap.SimpleEntry<>("1969-12-1 2:3:4.999    -09:11", -2637955001000L));
+    //  hh:mm:ss
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT+112233", 1571569871100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC-100102", 1571646886100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC+11:22:33", 1571569871100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT-10:10:10", 1571647434100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 GMT-8:08:01", 1571640105100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 UTC+4:59:59", 1571592825100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3  +102030", 1571492450300000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3   -020103", 1571536943300000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1   -8:08:01  ", 1571640105100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1+4:59:59", 1571592825100000L));
+
+    int validDataSize = entries.size();
+
+    // Invalid instances
+    // Timezone without hh:mm:ss
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 Iran", null));
+    // Invalid Timezone ID
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 Asia/London", null));
+    // Invalid UTC-like timezone
+    //  overflow
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 +10:60", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC-7:59:60", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 +19", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC-23", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT+1801", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 -180001", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC+18:00:10", null));
+    entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT-23:5", null));
+
+    List<String> inputs = new ArrayList<>();
+    List<Long> expects = new ArrayList<>();
+    for (Map.Entry<String, Long> entry : entries) {
+      inputs.add(entry.getKey());
+      expects.add(entry.getValue());
+    }
+
+    // Throw unsupported exception for symbols because Europe/London contains DST rules
+    assertThrows(ai.rapids.cudf.CudfException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 Europe/London")) {
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
+      }
+    });
+    // Throw unsupported exception for symbols of special dates
+    for (String date : new String[]{"epoch", "now", "today", "yesterday", "tomorrow"})
+    assertThrows(ai.rapids.cudf.CudfException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings(date)) {
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
       }
     });
 
+    // non-ANSI mode
+    try (
+        ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0]));
+        ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expects.toArray(new Long[0]));
+        ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) {
+      AssertUtils.assertColumnsAreEqual(expected, actual);
+    }
+
+    // Should NOT throw exception because all inputs are valid
+    String[] validInputs = inputs.stream().limit(validDataSize).toArray(String[]::new);
+    Long[] validExpects = expects.stream().limit(validDataSize).toArray(Long[]::new);
+    try (
+        ColumnVector input = ColumnVector.fromStrings(validInputs);
+        ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(validExpects);
+        ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), true)) {
+      AssertUtils.assertColumnsAreEqual(expected, actual);
+    }
+
+    // Throw IllegalArgumentException for invalid timestamps under ANSI mode
     assertThrows(IllegalArgumentException.class, () -> {
-      try (ColumnVector input = ColumnVector.fromStrings(" invalid_value ")) {
-        // ansiEnabled is true
-        CastStrings.toTimestampWithoutTimeZone(input, true, false, true);
+      try (ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0]))) {
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), true);
       }
     });
   }

From 7c9b8000593e4c57a6057dccb5aa979d9607102e Mon Sep 17 00:00:00 2001
From: sperlingxx <lovedreamf@gmail.com>
Date: Wed, 10 Jan 2024 19:40:55 +0800
Subject: [PATCH 10/35] refine

---
 src/main/cpp/src/CastStringJni.cpp            |  63 ++-
 src/main/cpp/src/datetime_parser.cu           | 517 +++++++++++-------
 src/main/cpp/src/datetime_parser.hpp          |  64 ++-
 .../spark/rapids/jni/GpuTimeZoneDB.java       |  18 +-
 4 files changed, 403 insertions(+), 259 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 60bf69ff2e..de23f48c39 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -257,21 +257,20 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv *env,
-                                                                                 jclass,
-                                                                                 jlong input_column,
-                                                                                 jlong transitions_handle,
-                                                                                 jlong tz_indices_col,
-                                                                                 jlong special_dt_lit_col,
-                                                                                 jint tz_default_index,
-                                                                                 jboolean ansi_enabled)
-{
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(
+    JNIEnv *env, jclass, jlong input_column, jlong transitions_handle,
+    jlong tz_indices_col, jlong special_dt_lit_col, jint tz_default_index,
+    jboolean ansi_enabled) {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
 
-    auto const &input_view = cudf::strings_column_view(*reinterpret_cast<cudf::column_view const *>(input_column));
-    auto const transitions = reinterpret_cast<cudf::table_view const *>(transitions_handle)->column(0);
+    auto const &input_view = cudf::strings_column_view(
+        *reinterpret_cast<cudf::column_view const *>(input_column));
+    auto const transitions =
+        reinterpret_cast<cudf::table_view const *>(transitions_handle)
+            ->column(0);
     auto const &tz_indices_view = cudf::strings_column_view(
         *reinterpret_cast<cudf::column_view const *>(tz_indices_col));
     auto const &special_dt_lit_view = cudf::strings_column_view(
@@ -279,40 +278,46 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp
 
     auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
 
-    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_with_tz(
-      input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled);
-    if (success) { return cudf::jni::release_as_jlong(ret_cv); }
+    auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz(
+        input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index,
+        ansi_enabled);
+    if (ret_cv) {
+      return cudf::jni::release_as_jlong(ret_cv);
+    }
   }
   CATCH_STD(env, 0);
 
   // sucess is false, throw exception.
-  // Note: do not need to release ret_cv, because it's nullptr when success is false.
-  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
+  // Note: do not need to release ret_cv, because it's nullptr when success is
+  // false.
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                "Parse failed on Ansi mode", 0);
 }
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(
-  JNIEnv* env,
-  jclass,
-  jlong input_column,
-  jlong special_dt_lit_col,
-  jboolean allow_time_zone,
-  jboolean ansi_enabled)
-{
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(
+    JNIEnv *env, jclass, jlong input_column, jlong special_dt_lit_col,
+    jboolean allow_time_zone, jboolean ansi_enabled) {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const &input_view = cudf::strings_column_view(*reinterpret_cast<cudf::column_view const *>(input_column));
+    auto const &input_view = cudf::strings_column_view(
+        *reinterpret_cast<cudf::column_view const *>(input_column));
     auto const &special_dt_lit_view = cudf::strings_column_view(
         *reinterpret_cast<cudf::column_view const *>(special_dt_lit_col));
 
-    auto [ret_cv, success] = spark_rapids_jni::string_to_timestamp_without_time_zone(
+    auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz(
         input_view, special_dt_lit_view, allow_time_zone, ansi_enabled);
-    if (success) { return cudf::jni::release_as_jlong(ret_cv); }
+    if (ret_cv) {
+      return cudf::jni::release_as_jlong(ret_cv);
+    }
   }
   CATCH_STD(env, 0);
 
   // sucess is false, throw exception.
-  // Note: do not need to release ret_cv, because it's nullptr when success is false.
-  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
+  // Note: do not need to release ret_cv, because it's nullptr when success is
+  // false.
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                "Parse failed on Ansi mode", 0);
 }
 }
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 31b5fe0099..a70e49fb33 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -49,14 +49,14 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-using column                   = cudf::column;
-using column_device_view       = cudf::column_device_view;
-using column_view              = cudf::column_view;
+using column = cudf::column;
+using column_device_view = cudf::column_device_view;
+using column_view = cudf::column_view;
 using lists_column_device_view = cudf::detail::lists_column_device_view;
-using size_type                = cudf::size_type;
-using string_view              = cudf::string_view;
-using struct_view              = cudf::struct_view;
-using table_view               = cudf::table_view;
+using size_type = cudf::size_type;
+using string_view = cudf::string_view;
+using struct_view = cudf::struct_view;
+using table_view = cudf::table_view;
 
 namespace {
 
@@ -64,7 +64,7 @@ namespace {
  * Represents local date time in a time zone.
  */
 struct timestamp_components {
-  int32_t year;  // max 6 digits
+  int32_t year; // max 6 digits
   int8_t month;
   int8_t day;
   int8_t hour;
@@ -76,14 +76,15 @@ struct timestamp_components {
 /**
  * Is white space
  */
-__device__ __host__ inline bool is_whitespace(const char chr)
-{
+__device__ __host__ inline bool is_whitespace(const char chr) {
   switch (chr) {
-    case ' ':
-    case '\r':
-    case '\t':
-    case '\n': return true;
-    default: return false;
+  case ' ':
+  case '\r':
+  case '\t':
+  case '\n':
+    return true;
+  default:
+    return false;
   }
 }
 
@@ -97,11 +98,16 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin,
                                                 char const *actual_end,
                                                 char const *expect_begin,
                                                 char const *expect_end) {
-  if (actual_end - actual_begin != expect_end - expect_begin) { return false; }
+  if (actual_end - actual_begin != expect_end - expect_begin) {
+    return false;
+  }
 
   while (expect_begin < expect_end) {
     // the diff between upper case and lower case for a same char is 32
-    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; }
+    if (*actual_begin != *expect_begin &&
+        *actual_begin != (*expect_begin - 32)) {
+      return false;
+    }
     actual_begin++;
     expect_begin++;
   }
@@ -111,112 +117,165 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin,
 /**
  * Ported from Spark
  */
-__device__ __host__ bool is_valid_digits(int segment, int digits)
-{
+__device__ __host__ bool is_valid_digits(int segment, int digits) {
   // A Long is able to represent a timestamp within [+-]200 thousand years
   const int constexpr maxDigitsYear = 6;
-  // For the nanosecond part, more than 6 digits is allowed, but will be truncated.
-  return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
-     // For the zoneId segment(7), it's could be zero digits when it's a region-based zone ID
-     (segment == 7 && digits <= 2) ||
-     (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
+  // For the nanosecond part, more than 6 digits is allowed, but will be
+  // truncated.
+  return segment == 6 ||
+         (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
+         // For the zoneId segment(7), there could be zero digits when it's a
+         // region-based zone ID
+         (segment == 7 && digits <= 2) ||
+         (segment != 0 && segment != 6 && segment != 7 && digits > 0 &&
+          digits <= 2);
 }
 
-enum ParseResult {
-  OK = 0,
-  INVALID = 1,
-  UNSUPPORTED = 2
-};
+/**
+ * We have to distinguish INVALID values from UNSUPPORTED values.
+ * INVALID means the value is invalid in Spark SQL.
+ * UNSUPPORTED means the value is valid in Spark SQL but not supported by rapids
+ * yet. INVALID values are treated the same way as in Spark SQL, whereas
+ * UNSUPPORTED values cause a cuDF exception to be thrown.
+ */
+enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 };
 
-template <bool with_timezone>
-struct parse_timestamp_string_fn {
+template <bool with_timezone> struct parse_timestamp_string_fn {
   column_device_view const d_strings;
   column_device_view const special_datetime_names;
   size_type default_tz_index;
   bool allow_tz_in_date_str = true;
   // The list column of transitions to figure out the correct offset
   // to adjust the timestamp. The type of the values in this column is
-  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32, looseTzInstant: int64>>.
-  thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
+  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
+  // looseTzInstant: int64>>.
+  thrust::optional<lists_column_device_view const> transitions =
+      thrust::nullopt;
   thrust::optional<column_device_view const> tz_indices = thrust::nullopt;
 
-  __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
-  {
+  __device__ thrust::tuple<cudf::timestamp_us, uint8_t>
+  operator()(const cudf::size_type &idx) const {
+    // inherit the nullmask of the input column
     if (!d_strings.is_valid(idx)) {
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                ParseResult::INVALID);
     }
 
     auto const d_str = d_strings.element<cudf::string_view>(idx);
 
     timestamp_components ts_comp{};
-    char const * tz_lit_ptr = nullptr;
+    char const *tz_lit_ptr = nullptr;
     size_type tz_lit_len = 0;
-    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) {
-      case ParseResult::INVALID:
-        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
-      case ParseResult::UNSUPPORTED:
-        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED);
-      case ParseResult::OK:
-      default:
-        break;
+    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len,
+                                         d_str)) {
+    case ParseResult::INVALID:
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                ParseResult::INVALID);
+    case ParseResult::UNSUPPORTED:
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                ParseResult::UNSUPPORTED);
+    case ParseResult::OK:
+    default:
+      break;
     }
 
     if constexpr (!with_timezone) {
-      // path without timezone, in which unix_timestamp is straightforwardly computed
+      // path without timezone, in which unix_timestamp is straightforwardly
+      // computed
       auto const ts_unaligned = compute_epoch_us(ts_comp);
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK);
+      return thrust::make_tuple(
+          cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK);
     }
-  
-    // path with timezone, in which timezone offset has to be determined before computing unix_timestamp
+
+    // path with timezone, in which timezone offset has to be determined before
+    // computing unix_timestamp
     int64_t tz_offset;
     if (tz_lit_ptr == nullptr) {
-      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
+      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp),
+                                          default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
-      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) {
+      // Firstly, try parsing as utc-like timezone rep
+      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view);
+          ret_code == 0) {
         tz_offset = utc_offset;
       } else if (ret_code == 1) {
+        // Then, try parsing as region-based timezone ID
         auto tz_index = query_index_from_tz_db(tz_view);
+        // tz_index < size(tzDB): found the ID in tzDB
+        // size(tzDB) <= tz_index < size(tzIDs): found but not supported yet
+        // tz_index == size(tzIDs): invalid timezone ID
         if (tz_index > transitions->size()) {
           if (tz_index == tz_indices->size())
-            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
-          return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::UNSUPPORTED);
+            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                      ParseResult::INVALID);
+          return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                    ParseResult::UNSUPPORTED);
         }
-        tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
+        tz_offset =
+            extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
       } else {
-        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+        // (ret_code == 2) quick path to mark value invalid
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                  ParseResult::INVALID);
       }
     }
 
+    // Compute the epoch as UTC timezone, then apply the timezone offset.
     auto const ts_unaligned = compute_epoch_us(ts_comp);
 
-    return thrust::make_tuple(
-        cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}},
-        ParseResult::OK);
+    return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{
+                                  ts_unaligned - tz_offset * 1000000L}},
+                              ParseResult::OK);
   }
 
-  // TODO: support CST/PST/AST
-  __device__ inline thrust::pair<int64_t, uint8_t> parse_utc_like_tz(string_view const &tz_lit) const
-  {
+  /**
+   * TODO: support CST/PST/AST
+   *
+   * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01.
+   * This function is meant to fully align with Apache Spark's behavior. It
+   * returns a status along with the result: 0 - successfully parsed the offset;
+   * 1 - not a valid UTC-like representation, but maybe a valid region-based ID;
+   * 2 - not a valid timezone representation at all.
+   *
+   * Valid patterns:
+   *   with colon
+   *     hh:mm      : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9])
+   *     hh:mm:ss   : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9]
+   *   without colon
+   *     hh only    : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8])
+   *     hh:mm:(ss) : ^(GMT|UTC)?[+-](0[0-9]|1[0-8])([0-5][0-9])?([0-5][0-9])?
+   *   special symbols:
+   *                  ^(Z|CST|PST|AST|...)
+   *
+   *   additional restriction: 18:00:00 is the upper bound (which means 18:00:01
+   * is invalid)
+   */
+  __device__ inline thrust::pair<int64_t, uint8_t>
+  parse_utc_like_tz(string_view const &tz_lit) const {
     size_type len = tz_lit.size_bytes();
 
     char const *ptr = tz_lit.data();
 
+    // try to parse Z
     if (*ptr == 'Z') {
-      if (len > 1) return {0, 1};
+      if (len > 1)
+        return {0, 1};
       return {0, 0};
     }
 
     size_t char_offset = 0;
-
-    if (len > 2
-        && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T')
-        || (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) {
+    // skip UTC|GMT if existing
+    if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') ||
+                    (*ptr == 'U' && *(ptr + 1) == 'T' && *(ptr + 2) == 'C'))) {
       char_offset = 3;
     }
 
-    if (len == char_offset) return {0, 0};
+    // return for the pattern UTC|GMT (without exact offset)
+    if (len == char_offset)
+      return {0, 0};
 
+    // parse sign +|-
     char const sign_char = *(ptr + char_offset++);
     int64_t sign;
     if (sign_char == '+') {
@@ -224,100 +283,138 @@ struct parse_timestamp_string_fn {
     } else if (sign_char == '-') {
       sign = -1L;
     } else {
+      // if the rep starts with UTC|GMT, it can NOT be regioned-base rep
       return {0, char_offset < 3 ? 1 : 2};
     }
 
+    // parse hh:mm:ss
     int64_t hms[3] = {0L, 0L, 0L};
     bool has_colon = false;
-    bool one_digit_mm = false;
     for (size_type i = 0; i < 3; i++) {
-      if (i == 2 && one_digit_mm) return {0, 2};
-
+      // deal with the first digit
       hms[i] = *(ptr + char_offset++) - '0';
-      if (hms[i] < 0 || hms[i] > 9) return {0, 2};
+      if (hms[i] < 0 || hms[i] > 9)
+        return {0, 2};
 
+      // deal with trailing single digit instant:
+      //  hh(GMT+8) - valid
+      //  mm(GMT+11:2) - must be separated from (h)h by `:`
+      //  ss(GMT-11:22:3) - invalid
       if (len == char_offset) {
-        if (i > 0) {
-          if (!has_colon) return {0, 2};
-          one_digit_mm = true;
-        }
+        if (i == 2 || (i == 1 && !has_colon))
+          return {0, 2};
         break;
       }
 
+      // deal with `:`
       if (*(ptr + char_offset) == ':') {
-        if (len == ++char_offset) break;
+        // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3)
+        // 2. (i == 2) one_digit ss is invalid (+11:22:3)
+        // 3. trailing `:` is invalid (GMT+8:)
+        if (i > 0 || len == ++char_offset)
+          return {0, 2};
         has_colon = true;
         continue;
       }
 
+      // deal with the second digit
       auto digit = *(ptr + char_offset++) - '0';
-      if (digit < 0 || digit > 9) return {0, 2};
+      if (digit < 0 || digit > 9)
+        return {0, 2};
       hms[i] = hms[i] * 10 + digit;
 
-      if (len == char_offset) break;
+      if (len == char_offset)
+        break;
+      // deal with `:`
       if (*(ptr + char_offset) == ':') {
-        if (len == ++char_offset) break;
+        // trailing `:` is invalid (UTC+11:)
+        if (len == ++char_offset)
+          return {0, 2};
         has_colon = true;
-        continue;
       }
-      if (has_colon) return {0, 2};
     }
 
-    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2};
-    if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2};
+    // the upper bound is 18:00:00 (regardless of sign)
+    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59)
+      return {0, 2};
+    if (hms[0] == 18 && hms[1] + hms[2] > 0)
+      return {0, 2};
 
     return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0};
   }
 
-  __device__ inline int query_index_from_tz_db(string_view const &tz_lit) const
-  {
-    // TODO: replace with more efficient approach (such as binary search or prefix tree)
+  /**
+   * TODO: replace linear search with more efficient approach (like prefix tree)
+   */
+  __device__ inline int
+  query_index_from_tz_db(string_view const &tz_lit) const {
     auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) {
       return tz->element<string_view>(i) == tz_lit;
     };
-    auto ret = thrust::find_if(thrust::seq,
-                               thrust::make_counting_iterator(0),
-                               thrust::make_counting_iterator(tz_indices->size()),
-                               predicate);
+    auto ret = thrust::find_if(
+        thrust::seq, thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(tz_indices->size()), predicate);
 
     return *ret;
   }
 
-  __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second, size_type tz_index) const
-  {
+  /**
+   * Perform a binary search to find the timezone offset based on loose epoch
+   * instants. Basically, this is the same approach as
+   * `convert_timestamp_tz_functor`.
+   */
+  __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second,
+                                                    size_type tz_index) const {
     auto const &utc_offsets = transitions->child().child(2);
     auto const &loose_instants = transitions->child().child(3);
 
-    auto const local_transitions = cudf::list_device_view{*transitions, tz_index};
+    auto const local_transitions =
+        cudf::list_device_view{*transitions, tz_index};
     auto const list_size = local_transitions.size();
 
     auto const transition_times = cudf::device_span<int64_t const>(
         loose_instants.data<int64_t>() + local_transitions.element_offset(0),
         static_cast<size_t>(list_size));
 
-    auto const it = thrust::upper_bound(
-        thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
-    auto const idx = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
+    auto const it =
+        thrust::upper_bound(thrust::seq, transition_times.begin(),
+                            transition_times.end(), loose_epoch_second);
+    auto const idx =
+        static_cast<size_type>(thrust::distance(transition_times.begin(), it));
     auto const list_offset = local_transitions.element_offset(idx - 1);
 
     return static_cast<int64_t>(utc_offsets.element<int32_t>(list_offset));
   }
 
-  __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
-  {
-    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L + ts.minute * 60L + ts.second;
+  /**
+   * The formula to compute loose epoch from local time. The loose epoch is used
+   * to search for the corresponding timezone offset of specific zone ID from
+   * TimezoneDB. The target of loose epoch is to transfer local time to a number
+   * which is proportional to the real timestamp as easily as possible. Loose
+   * epoch, as a computation approach, helps us to align probe(kernel side) to
+   * the TimezoneDB(Java side). Then, we can apply binary search based on loose
+   * epoch instants of TimezoneDB to find out the correct timezone offset.
+   */
+  __device__ inline int64_t
+  compute_loose_epoch_s(timestamp_components const &ts) const {
+    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L +
+           ts.hour * 3600L + ts.minute * 60L + ts.second;
   }
 
-  __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
-  {
-    auto const ymd =  // chrono class handles the leap year calculations for us
+  /**
+   * Leverage STL to convert local time to UTC unix_timestamp(in microseconds)
+   */
+  __device__ inline int64_t
+  compute_epoch_us(timestamp_components const &ts) const {
+    auto const ymd = // chrono class handles the leap year calculations for us
         cuda::std::chrono::year_month_day(
             cuda::std::chrono::year{ts.year},
             cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
             cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
     auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
 
-    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) +
+                          (ts.minute * 60L) + ts.second;
 
     return timestamp_s * 1000000L + ts.microseconds;
   }
@@ -330,13 +427,15 @@ struct parse_timestamp_string_fn {
    * Parse a string with time zone to a timestamp.
    * The bool in the returned tuple is false if the parse failed.
    */
-  __device__ inline ParseResult parse_string_to_timestamp_us(
-      timestamp_components *ts_comp,
-      char const **parsed_tz_ptr,
-      size_type *parsed_tz_length,
-      cudf::string_view const &timestamp_str) const {
-
-    if (timestamp_str.empty()) { return ParseResult::INVALID; }
+  __device__ inline ParseResult
+  parse_string_to_timestamp_us(timestamp_components *ts_comp,
+                               char const **parsed_tz_ptr,
+                               size_type *parsed_tz_length,
+                               cudf::string_view const &timestamp_str) const {
+
+    if (timestamp_str.empty()) {
+      return ParseResult::INVALID;
+    }
 
     const char *curr_ptr = timestamp_str.data();
     const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
@@ -352,15 +451,18 @@ struct parse_timestamp_string_fn {
 
     // TODO: support special dates [epoch, now, today, yesterday, tomorrow]
     for (size_type i = 0; i < special_datetime_names.size(); i++) {
-      auto const& ref = special_datetime_names.element<string_view>(i);
-      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) {
+      auto const &ref = special_datetime_names.element<string_view>(i);
+      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(),
+                                   ref.data() + ref.size_bytes())) {
         *parsed_tz_ptr = ref.data();
         *parsed_tz_length = ref.size_bytes();
         return ParseResult::UNSUPPORTED;
       }
     }
 
-    if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
+    if (curr_ptr == end_ptr) {
+      return ParseResult::INVALID;
+    }
 
     const char *const bytes = curr_ptr;
     const size_type bytes_length = end_ptr - curr_ptr;
@@ -392,14 +494,18 @@ struct parse_timestamp_string_fn {
           i += 3;
         } else if (i < 2) {
           if (b == '-') {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
             i += 1;
           } else if (0 == i && ':' == b && !year_sign.has_value()) {
             // just_time = true;
-            if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(3, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[3] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
@@ -409,7 +515,9 @@ struct parse_timestamp_string_fn {
           }
         } else if (2 == i) {
           if (' ' == b || 'T' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
@@ -419,7 +527,9 @@ struct parse_timestamp_string_fn {
           }
         } else if (3 == i || 4 == i) {
           if (':' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
@@ -429,27 +539,37 @@ struct parse_timestamp_string_fn {
           }
         } else if (5 == i || 6 == i) {
           if ('.' == b && 5 == i) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
-            if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits) ||
+                !allow_tz_in_date_str) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
             i += 1;
             *parsed_tz_ptr = bytes + j;
             // strip the whitespace between timestamp and timezone
-            while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr)) ++(*parsed_tz_ptr);
+            while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr))
+              ++(*parsed_tz_ptr);
             *parsed_tz_length = end_ptr - *parsed_tz_ptr;
             break;
           }
-          if (i == 6 && '.' != b) { i += 1; }
+          if (i == 6 && '.' != b) {
+            i += 1;
+          }
         } else {
           if (i < segments_len && (':' == b || ' ' == b)) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) {
+              return ParseResult::INVALID;
+            }
             segments[i] = current_segment_value;
             current_segment_value = 0;
             current_segment_digits = 0;
@@ -459,9 +579,11 @@ struct parse_timestamp_string_fn {
           }
         }
       } else {
-        if (6 == i) { digits_milli += 1; }
-        // We will truncate the nanosecond part if there are more than 6 digits, which results
-        // in loss of precision
+        if (6 == i) {
+          digits_milli += 1;
+        }
+        // We will truncate the nanosecond part if there are more than 6 digits,
+        // which results in loss of precision
         if (6 != i || current_segment_digits < 6) {
           current_segment_value = current_segment_value * 10 + parsed_value;
         }
@@ -470,7 +592,9 @@ struct parse_timestamp_string_fn {
       j += 1;
     }
 
-    if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+    if (!is_valid_digits(i, current_segment_digits)) {
+      return ParseResult::INVALID;
+    }
     segments[i] = current_segment_value;
 
     while (digits_milli < 6) {
@@ -495,97 +619,92 @@ struct parse_timestamp_string_fn {
 };
 
 /**
- *
- * Trims and parses timestamp string column to a timestamp column and a is valid column
+ * The common entrance of string_to_timestamp, which combines two paths:
+ * with_timezone and without_timezone. The transitions, tz_indices and
+ * default_tz_index are only used for inputs with timezone, so this function
+ * distinguishes with_timezone calls from without_timezone ones by checking
+ * whether transitions and tz_indices are nullptr.
  *
  */
-std::pair<std::unique_ptr<cudf::column>, bool> to_timestamp(
-  cudf::strings_column_view const& input,
-  cudf::strings_column_view const& special_datetime_lit,
-  bool ansi_mode,
-  bool allow_tz_in_date_str = true,
-  size_type default_tz_index = 1000000000,
-  cudf::column_view const *transitions = nullptr,
-  cudf::strings_column_view const *tz_indices = nullptr)
-{
+std::unique_ptr<cudf::column>
+to_timestamp(cudf::strings_column_view const &input,
+             cudf::strings_column_view const &special_datetime_lit,
+             bool ansi_mode, bool allow_tz_in_date_str = true,
+             size_type default_tz_index = 1000000000,
+             cudf::column_view const *transitions = nullptr,
+             cudf::strings_column_view const *tz_indices = nullptr) {
   auto const stream = cudf::get_default_stream();
   auto const mr = rmm::mr::get_current_device_resource();
 
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
-  auto d_special_datetime_lit = cudf::column_device_view::create(special_datetime_lit.parent(), stream);
-
-  auto result_col =
-      cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
-                                  input.size(),
-                                  cudf::mask_state::UNALLOCATED,
-                                  stream,
-                                  mr);
-  // record which string is failed to parse.
-  auto result_valid_col =
-      cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::UINT8},
-                                    input.size(),
-                                    cudf::mask_state::UNALLOCATED,
-                                    stream,
-                                    mr);
+  auto d_special_datetime_lit =
+      cudf::column_device_view::create(special_datetime_lit.parent(), stream);
+
+  // column to store the result timestamp
+  auto result_col = cudf::make_timestamp_column(
+      cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(),
+      cudf::mask_state::UNALLOCATED, stream, mr);
+  // column to store the status `ParseResult`
+  auto result_valid_col = cudf::make_fixed_width_column(
+      cudf::data_type{cudf::type_id::UINT8}, input.size(),
+      cudf::mask_state::UNALLOCATED, stream, mr);
 
   if (transitions == nullptr || tz_indices == nullptr) {
     thrust::transform(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
+        rmm::exec_policy(stream), thrust::make_counting_iterator(0),
         thrust::make_counting_iterator(input.size()),
-        thrust::make_zip_iterator(
-            thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
-                               result_valid_col->mutable_view().begin<uint8_t>())),
-        parse_timestamp_string_fn<false>{*d_strings,
-                                         *d_special_datetime_lit,
+        thrust::make_zip_iterator(thrust::make_tuple(
+            result_col->mutable_view().begin<cudf::timestamp_us>(),
+            result_valid_col->mutable_view().begin<uint8_t>())),
+        parse_timestamp_string_fn<false>{*d_strings, *d_special_datetime_lit,
                                          default_tz_index,
                                          allow_tz_in_date_str});
   } else {
     auto const ft_cdv_ptr = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
-    auto d_tz_indices = cudf::column_device_view::create(tz_indices->parent(), stream);
+    auto d_tz_indices =
+        cudf::column_device_view::create(tz_indices->parent(), stream);
 
     thrust::transform(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
+        rmm::exec_policy(stream), thrust::make_counting_iterator(0),
         thrust::make_counting_iterator(input.size()),
-        thrust::make_zip_iterator(
-            thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
-                               result_valid_col->mutable_view().begin<uint8_t>())),
-        parse_timestamp_string_fn<true>{*d_strings,
-                                        *d_special_datetime_lit,
-                                        default_tz_index,
-                                        true,
-                                        d_transitions,
+        thrust::make_zip_iterator(thrust::make_tuple(
+            result_col->mutable_view().begin<cudf::timestamp_us>(),
+            result_valid_col->mutable_view().begin<uint8_t>())),
+        parse_timestamp_string_fn<true>{*d_strings, *d_special_datetime_lit,
+                                        default_tz_index, true, d_transitions,
                                         *d_tz_indices});
   }
 
   auto valid_view = result_valid_col->mutable_view();
 
-  auto exception_exists = thrust::any_of(
-    rmm::exec_policy(stream),
-    valid_view.begin<uint8_t>(),
-    valid_view.end<uint8_t>(),
-    []__device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; });
+  // throw cuDF exception if there exists any unsupported formats
+  auto exception_exists =
+      thrust::any_of(rmm::exec_policy(stream), valid_view.begin<uint8_t>(),
+                     valid_view.end<uint8_t>(), [] __device__(uint8_t e) {
+                       return e == ParseResult::UNSUPPORTED;
+                     });
   if (exception_exists) {
     CUDF_FAIL("There exists unsupported timestamp schema!");
   }
 
+  // build the updated nullmask and compute the null count
   auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if(
       valid_view.begin<uint8_t>(), valid_view.end<uint8_t>(),
-      [] __device__(uint8_t e) { return e == 0; },
-      stream, mr);
+      [] __device__(uint8_t e) { return e == 0; }, stream, mr);
 
+  // `output null count > input null count` indicates that there are new null
+  // values generated during the `to_timestamp` transaction to replace invalid
+  // inputs.
   if (ansi_mode && input.null_count() < valid_null_count) {
-    // has invalid value in validity column under ansi mode
-    return std::make_pair(nullptr, false);
+    return nullptr;
   }
 
   result_col->set_null_mask(valid_bitmask, valid_null_count, stream);
-  return std::make_pair(std::move(result_col), true);
+  return std::move(result_col);
 }
 
-}  // namespace
+} // namespace
 
 namespace spark_rapids_jni {
 
@@ -594,36 +713,34 @@ namespace spark_rapids_jni {
  * Returns a pair of timestamp column and a bool indicates whether successed.
  * If does not have time zone in string, use the default time zone.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
-  cudf::strings_column_view const& input,
-  cudf::column_view const& transitions,
-  cudf::strings_column_view const& tz_indices,
-  cudf::strings_column_view const& special_datetime_lit,
-  cudf::size_type default_tz_index,
-  bool ansi_mode)
-{
+std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
+    cudf::strings_column_view const &input,
+    cudf::column_view const &transitions,
+    cudf::strings_column_view const &tz_indices,
+    cudf::strings_column_view const &special_datetime_lit,
+    cudf::size_type default_tz_index, bool ansi_mode) {
   if (input.size() == 0) {
-    return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true);
+    return nullptr;
   }
-  return to_timestamp(input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
+  return to_timestamp(input, special_datetime_lit, ansi_mode, true,
+                      default_tz_index, &transitions, &tz_indices);
 }
 
 /**
  * Parse string column with time zone to timestamp column,
  * Returns a pair of timestamp column and a bool indicates whether successed.
  * Do not use the time zone in string.
- * If allow_time_zone is false and string contains time zone, then the string is invalid.
+ * If allow_time_zone is false and string contains time zone, then the string is
+ * invalid.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
-  cudf::strings_column_view const& input,
-  cudf::strings_column_view const& special_datetime_lit,
-  bool allow_time_zone,
-  bool ansi_mode)
-{
+std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
+    cudf::strings_column_view const &input,
+    cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone,
+    bool ansi_mode) {
   if (input.size() == 0) {
-    return std::make_pair(cudf::make_empty_column(cudf::type_id::TIMESTAMP_MICROSECONDS), true);
+    return nullptr;
   }
   return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone);
 }
 
-}  // namespace spark_rapids_jni
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 1b72a1fbb8..9c536b6837 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -20,8 +20,9 @@ namespace spark_rapids_jni {
 
 /**
  *
- * Trims and parses a timestamp string column with time zone suffix to a timestamp column.
- * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00
+ * Trims and parses a timestamp string column with time zone suffix to a
+ * timestamp column. e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13
+ * 18:00:00
  *
  * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
  * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
@@ -47,7 +48,8 @@ namespace spark_rapids_jni {
  * Spark supports the following zone id forms:
  *   - Z - Zulu time zone UTC+0
  *   - +|-[h]h:[m]m
- *   - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+ *   - A short id, see
+ * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
  *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
  *     and a suffix in the formats:
  *     - +|-h[h]
@@ -63,23 +65,30 @@ namespace spark_rapids_jni {
  *
  *
  * @param input input string column view.
- * @param default_time_zone if input string does not contain a time zone, use this time zone.
- * @param ansi_mode is ansi mode
- * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not
- * empty otherwise.
+ * @param transitions TimezoneDB, the table of transitions contains all
+ * information for timezones
+ * @param tz_indices TimezoneDB index of region-based timezone IDs
+ * @param special_datetime_lit cache of special datetimes
+ * @param default_tz_index the index of default timezone in TimezoneDB, if input
+ * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use
+ * this time zone.
+ * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be
+ * thrown encountering any invalid inputs.
+ * @returns the pointer of the timestamp result column, which points to nullptr
+ * if there exists invalid inputs and ANSI mode is on.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
-    cudf::strings_column_view const& input,
-    cudf::column_view const& transitions,
-    cudf::strings_column_view const& tz_indices,
-    cudf::strings_column_view const& special_datetime_lit,
-    cudf::size_type default_tz_index,
-    bool ansi_mode);
+std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
+    cudf::strings_column_view const &input,
+    cudf::column_view const &transitions,
+    cudf::strings_column_view const &tz_indices,
+    cudf::strings_column_view const &special_datetime_lit,
+    cudf::size_type default_tz_index, bool ansi_mode);
 
 /**
  *
- * Trims and parses a timestamp string column with time zone suffix to a timestamp column.
- * e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13 18:00:00
+ * Trims and parses a timestamp string column with time zone suffix to a
+ * timestamp column. e.g.: 1991-04-14T02:00:00Asia/Shanghai => 1991-04-13
+ * 18:00:00
  *
  * Refer to: https://github.com/apache/spark/blob/v3.5.0/sql/api/src/main/scala/
  * org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala#L394
@@ -105,7 +114,8 @@ std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
  * Spark supports the following zone id forms:
  *   - Z - Zulu time zone UTC+0
  *   - +|-[h]h:[m]m
- *   - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+ *   - A short id, see
+ * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
  *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
  *     and a suffix in the formats:
  *     - +|-h[h]
@@ -121,17 +131,17 @@ std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_with_tz(
  *
  *
  * @param input input string column view.
+ * @param special_datetime_lit cache of special datetimes
  * @param allow_time_zone whether allow time zone in the timestamp string. e.g.:
  *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
- * @param allow_special_expressions whether allow epoch, now, today, yesterday, tomorrow strings.
- * @param ansi_mode is ansi mode
- * @returns a timestamp column and a bool column. Bool column is empty if ansi mode is false, not
- * empty otherwise.
+ * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be
+ * thrown encountering any invalid inputs.
+ * @returns the pointer of the timestamp result column, which points to nullptr
+ * if there exists invalid inputs and ANSI mode is on.
  */
-std::pair<std::unique_ptr<cudf::column>, bool> string_to_timestamp_without_time_zone(
-  cudf::strings_column_view const& input,
-  cudf::strings_column_view const& special_datetime_lit,
-  bool allow_time_zone,
-  bool ansi_mode);
+std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
+    cudf::strings_column_view const &input,
+    cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone,
+    bool ansi_mode);
 
-}  // namespace spark_rapids_jni
+} // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index f8b49b5b22..cc9ba04a8e 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -40,7 +40,7 @@ public class GpuTimeZoneDB {
 
   // For the timezone database, we store the transitions in a ColumnVector that is a list of 
   // structs. The type of this column vector is:
-  //   LIST<STRUCT<utcInstant: int64, localInstant: int64, offset: int32>>
+  //   LIST<STRUCT<utcInstant: int64, localInstant: int64, offset: int32, looseInstant: int64>>
   private CompletableFuture<Map<String, Integer>> zoneIdToTableFuture;
   private CompletableFuture<HostColumnVector> fixedTransitionsFuture;
   private CompletableFuture<HostColumnVector> zoneIdVectorFuture;
@@ -61,7 +61,7 @@ public class GpuTimeZoneDB {
   static GpuTimeZoneDB getInstance() {
     return instance;
   }
-  
+
   /**
    * Start to cache the database. This should be called on startup of an executor. It should start
    * to cache the data on the CPU in a background thread. It should return immediately and allow the
@@ -181,6 +181,16 @@ private void doLoadData() {
       try {
         Map<String, Integer> zoneIdToTable = new HashMap<>();
         List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
+        // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings.
+        // For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the
+        // offset of "Asia/Shanghai" in timezoneDB.
+        //
+        // Currently, we do NOT support all timezone IDs. For unsupported ones, we ought to throw Exception anyway. And
+        // for invalid ones, we replace them with NULL value when ANSI mode is off. Therefore, we need to distinguish the
+        // unsupported ones from invalid ones which means the unsupported Ids need to be collected as well.
+        // To distinguish supported IDs from unsupported ones, we place all unsupported IDs behind supported ones:
+        // 1. Collect the IDs of all supported timezones in the order of masterTransitions.
+        // 2. Append the IDs of all unsupported timezones after the supported ones.
         List<String> zondIdList = new ArrayList<>();
         List<String> unsupportedZoneList = new ArrayList<>();
 
@@ -221,7 +231,7 @@ private void doLoadData() {
                 // the exact EpochSecond. After caching these values along with EpochSeconds, we
                 // can easily search out which time zone transition rule we should apply according
                 // to LocalDateTime structs. The searching procedure is same as the binary search with
-                // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose EpochSeconds"
+                // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant"
                 // as search index instead of exact EpochSeconds.
                 Function<LocalDateTime, Long> localToLooseEpochSecond = lt ->
                         86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L +
@@ -257,6 +267,7 @@ private void doLoadData() {
             }
             masterTransitions.add(data);
             zoneIdToTable.put(zoneId.getId(), idx);
+            // Collect the IDs of all supported timezones in the order of masterTransitions
             zondIdList.add(zoneId.getId());
           }
         }
@@ -270,6 +281,7 @@ private void doLoadData() {
         HostColumnVector.DataType resultType =
             new HostColumnVector.ListType(false, childType);
 
+        // Append the IDs of all unsupported timezones after the supported ones.
         zondIdList.addAll(unsupportedZoneList);
 
         try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) {

From 93d4a66cc3e5617ddf839cb175b79156d28d41f4 Mon Sep 17 00:00:00 2001
From: sperlingxx <lovedreamf@gmail.com>
Date: Wed, 10 Jan 2024 21:39:51 +0800
Subject: [PATCH 11/35] fix clang-fmt

---
 src/main/cpp/src/CastStringJni.cpp   |  65 ++--
 src/main/cpp/src/datetime_parser.cu  | 470 ++++++++++++---------------
 src/main/cpp/src/datetime_parser.hpp |  20 +-
 3 files changed, 254 insertions(+), 301 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index de23f48c39..ee0a053b88 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -258,66 +258,67 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromInteger
 }
 
 JNIEXPORT jlong JNICALL
-Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(
-    JNIEnv *env, jclass, jlong input_column, jlong transitions_handle,
-    jlong tz_indices_col, jlong special_dt_lit_col, jint tz_default_index,
-    jboolean ansi_enabled) {
+Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
+                                                         jclass,
+                                                         jlong input_column,
+                                                         jlong transitions_handle,
+                                                         jlong tz_indices_col,
+                                                         jlong special_dt_lit_col,
+                                                         jint tz_default_index,
+                                                         jboolean ansi_enabled)
+{
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
 
-    auto const &input_view = cudf::strings_column_view(
-        *reinterpret_cast<cudf::column_view const *>(input_column));
+    auto const& input_view =
+      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(input_column));
     auto const transitions =
-        reinterpret_cast<cudf::table_view const *>(transitions_handle)
-            ->column(0);
-    auto const &tz_indices_view = cudf::strings_column_view(
-        *reinterpret_cast<cudf::column_view const *>(tz_indices_col));
-    auto const &special_dt_lit_view = cudf::strings_column_view(
-        *reinterpret_cast<cudf::column_view const *>(special_dt_lit_col));
+      reinterpret_cast<cudf::table_view const*>(transitions_handle)->column(0);
+    auto const& tz_indices_view =
+      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(tz_indices_col));
+    auto const& special_dt_lit_view =
+      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(special_dt_lit_col));
 
     auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
 
     auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz(
-        input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index,
-        ansi_enabled);
-    if (ret_cv) {
-      return cudf::jni::release_as_jlong(ret_cv);
-    }
+      input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled);
+    if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
 
   // sucess is false, throw exception.
   // Note: do not need to release ret_cv, because it's nullptr when success is
   // false.
-  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                "Parse failed on Ansi mode", 0);
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
 }
 
 JNIEXPORT jlong JNICALL
-Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(
-    JNIEnv *env, jclass, jlong input_column, jlong special_dt_lit_col,
-    jboolean allow_time_zone, jboolean ansi_enabled) {
+Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env,
+                                                                        jclass,
+                                                                        jlong input_column,
+                                                                        jlong special_dt_lit_col,
+                                                                        jboolean allow_time_zone,
+                                                                        jboolean ansi_enabled)
+{
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const &input_view = cudf::strings_column_view(
-        *reinterpret_cast<cudf::column_view const *>(input_column));
-    auto const &special_dt_lit_view = cudf::strings_column_view(
-        *reinterpret_cast<cudf::column_view const *>(special_dt_lit_col));
+    auto const& input_view =
+      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(input_column));
+    auto const& special_dt_lit_view =
+      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(special_dt_lit_col));
 
     auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz(
-        input_view, special_dt_lit_view, allow_time_zone, ansi_enabled);
-    if (ret_cv) {
-      return cudf::jni::release_as_jlong(ret_cv);
-    }
+      input_view, special_dt_lit_view, allow_time_zone, ansi_enabled);
+    if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
 
   // sucess is false, throw exception.
   // Note: do not need to release ret_cv, because it's nullptr when success is
   // false.
-  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                "Parse failed on Ansi mode", 0);
+  JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
 }
 }
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index a70e49fb33..7e2a73d959 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -49,14 +49,14 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-using column = cudf::column;
-using column_device_view = cudf::column_device_view;
-using column_view = cudf::column_view;
+using column                   = cudf::column;
+using column_device_view       = cudf::column_device_view;
+using column_view              = cudf::column_view;
 using lists_column_device_view = cudf::detail::lists_column_device_view;
-using size_type = cudf::size_type;
-using string_view = cudf::string_view;
-using struct_view = cudf::struct_view;
-using table_view = cudf::table_view;
+using size_type                = cudf::size_type;
+using string_view              = cudf::string_view;
+using struct_view              = cudf::struct_view;
+using table_view               = cudf::table_view;
 
 namespace {
 
@@ -64,7 +64,7 @@ namespace {
  * Represents local date time in a time zone.
  */
 struct timestamp_components {
-  int32_t year; // max 6 digits
+  int32_t year;  // max 6 digits
   int8_t month;
   int8_t day;
   int8_t hour;
@@ -76,15 +76,14 @@ struct timestamp_components {
 /**
  * Is white space
  */
-__device__ __host__ inline bool is_whitespace(const char chr) {
+__device__ __host__ inline bool is_whitespace(const char chr)
+{
   switch (chr) {
-  case ' ':
-  case '\r':
-  case '\t':
-  case '\n':
-    return true;
-  default:
-    return false;
+    case ' ':
+    case '\r':
+    case '\t':
+    case '\n': return true;
+    default: return false;
   }
 }
 
@@ -94,20 +93,16 @@ __device__ __host__ inline bool is_whitespace(const char chr) {
  *   "epoch", "now", "today", "yesterday", "tomorrow"
  * the expect string should be lower-case a-z chars
  */
-__device__ inline bool equals_ascii_ignore_case(char const *actual_begin,
-                                                char const *actual_end,
-                                                char const *expect_begin,
-                                                char const *expect_end) {
-  if (actual_end - actual_begin != expect_end - expect_begin) {
-    return false;
-  }
+__device__ inline bool equals_ascii_ignore_case(char const* actual_begin,
+                                                char const* actual_end,
+                                                char const* expect_begin,
+                                                char const* expect_end)
+{
+  if (actual_end - actual_begin != expect_end - expect_begin) { return false; }
 
   while (expect_begin < expect_end) {
     // the diff between upper case and lower case for a same char is 32
-    if (*actual_begin != *expect_begin &&
-        *actual_begin != (*expect_begin - 32)) {
-      return false;
-    }
+    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; }
     actual_begin++;
     expect_begin++;
   }
@@ -117,18 +112,17 @@ __device__ inline bool equals_ascii_ignore_case(char const *actual_begin,
 /**
  * Ported from Spark
  */
-__device__ __host__ bool is_valid_digits(int segment, int digits) {
+__device__ __host__ bool is_valid_digits(int segment, int digits)
+{
   // A Long is able to represent a timestamp within [+-]200 thousand years
   const int constexpr maxDigitsYear = 6;
   // For the nanosecond part, more than 6 digits is allowed, but will be
   // truncated.
-  return segment == 6 ||
-         (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
+  return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
          // For the zoneId segment(7), it's could be zero digits when it's a
          // region-based zone ID
          (segment == 7 && digits <= 2) ||
-         (segment != 0 && segment != 6 && segment != 7 && digits > 0 &&
-          digits <= 2);
+         (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
 /**
@@ -140,7 +134,8 @@ __device__ __host__ bool is_valid_digits(int segment, int digits) {
  */
 enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 };
 
-template <bool with_timezone> struct parse_timestamp_string_fn {
+template <bool with_timezone>
+struct parse_timestamp_string_fn {
   column_device_view const d_strings;
   column_device_view const special_datetime_names;
   size_type default_tz_index;
@@ -149,55 +144,48 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
   // to adjust the timestamp. The type of the values in this column is
   // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
   // looseTzInstant: int64>>.
-  thrust::optional<lists_column_device_view const> transitions =
-      thrust::nullopt;
-  thrust::optional<column_device_view const> tz_indices = thrust::nullopt;
+  thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
+  thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
 
-  __device__ thrust::tuple<cudf::timestamp_us, uint8_t>
-  operator()(const cudf::size_type &idx) const {
+  __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
+  {
     // inherit the nullmask of the input column
     if (!d_strings.is_valid(idx)) {
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                ParseResult::INVALID);
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
     }
 
     auto const d_str = d_strings.element<cudf::string_view>(idx);
 
     timestamp_components ts_comp{};
-    char const *tz_lit_ptr = nullptr;
-    size_type tz_lit_len = 0;
-    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len,
-                                         d_str)) {
-    case ParseResult::INVALID:
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                ParseResult::INVALID);
-    case ParseResult::UNSUPPORTED:
-      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                ParseResult::UNSUPPORTED);
-    case ParseResult::OK:
-    default:
-      break;
+    char const* tz_lit_ptr = nullptr;
+    size_type tz_lit_len   = 0;
+    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) {
+      case ParseResult::INVALID:
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+      case ParseResult::UNSUPPORTED:
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                  ParseResult::UNSUPPORTED);
+      case ParseResult::OK:
+      default: break;
     }
 
     if constexpr (!with_timezone) {
       // path without timezone, in which unix_timestamp is straightforwardly
       // computed
       auto const ts_unaligned = compute_epoch_us(ts_comp);
-      return thrust::make_tuple(
-          cudf::timestamp_us{cudf::duration_us{ts_unaligned}}, ParseResult::OK);
+      return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{ts_unaligned}},
+                                ParseResult::OK);
     }
 
     // path with timezone, in which timezone offset has to be determined before
     // computing unix_timestamp
     int64_t tz_offset;
     if (tz_lit_ptr == nullptr) {
-      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp),
-                                          default_tz_index);
+      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
       // Firstly, try parsing as utc-like timezone rep
-      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view);
-          ret_code == 0) {
+      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) {
         tz_offset = utc_offset;
       } else if (ret_code == 1) {
         // Then, try parsing as region-based timezone ID
@@ -212,21 +200,18 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
           return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
                                     ParseResult::UNSUPPORTED);
         }
-        tz_offset =
-            extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
+        tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
       } else {
         // (ret_code == 2) quick path to mark value invalid
-        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                  ParseResult::INVALID);
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       }
     }
 
     // Compute the epoch as UTC timezone, then apply the timezone offset.
     auto const ts_unaligned = compute_epoch_us(ts_comp);
 
-    return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{
-                                  ts_unaligned - tz_offset * 1000000L}},
-                              ParseResult::OK);
+    return thrust::make_tuple(
+      cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, ParseResult::OK);
   }
 
   /**
@@ -251,16 +236,16 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
    *   additional restriction: 18:00:00 is the upper bound (which means 18:00:01
    * is invalid)
    */
-  __device__ inline thrust::pair<int64_t, uint8_t>
-  parse_utc_like_tz(string_view const &tz_lit) const {
+  __device__ inline thrust::pair<int64_t, uint8_t> parse_utc_like_tz(
+    string_view const& tz_lit) const
+  {
     size_type len = tz_lit.size_bytes();
 
-    char const *ptr = tz_lit.data();
+    char const* ptr = tz_lit.data();
 
     // try to parse Z
     if (*ptr == 'Z') {
-      if (len > 1)
-        return {0, 1};
+      if (len > 1) return {0, 1};
       return {0, 0};
     }
 
@@ -272,8 +257,7 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
     }
 
     // return for the pattern UTC|GMT (without exact offset)
-    if (len == char_offset)
-      return {0, 0};
+    if (len == char_offset) return {0, 0};
 
     // parse sign +|-
     char const sign_char = *(ptr + char_offset++);
@@ -293,16 +277,14 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
     for (size_type i = 0; i < 3; i++) {
       // deal with the first digit
       hms[i] = *(ptr + char_offset++) - '0';
-      if (hms[i] < 0 || hms[i] > 9)
-        return {0, 2};
+      if (hms[i] < 0 || hms[i] > 9) return {0, 2};
 
       // deal with trailing single digit instant:
       //  hh(GMT+8) - valid
       //  mm(GMT+11:2) - must be separated from (h)h by `:`
       //  ss(GMT-11:22:3) - invalid
       if (len == char_offset) {
-        if (i == 2 || (i == 1 && !has_colon))
-          return {0, 2};
+        if (i == 2 || (i == 1 && !has_colon)) return {0, 2};
         break;
       }
 
@@ -311,34 +293,28 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
         // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3)
         // 2. (i == 2) one_dight ss is invalid (+11:22:3)
         // 3. trailing `:` is invalid (GMT+8:)
-        if (i > 0 || len == ++char_offset)
-          return {0, 2};
+        if (i > 0 || len == ++char_offset) return {0, 2};
         has_colon = true;
         continue;
       }
 
       // deal with the second digit
       auto digit = *(ptr + char_offset++) - '0';
-      if (digit < 0 || digit > 9)
-        return {0, 2};
+      if (digit < 0 || digit > 9) return {0, 2};
       hms[i] = hms[i] * 10 + digit;
 
-      if (len == char_offset)
-        break;
+      if (len == char_offset) break;
       // deal with `:`
       if (*(ptr + char_offset) == ':') {
         // trailing `:` is invalid (UTC+11:)
-        if (len == ++char_offset)
-          return {0, 2};
+        if (len == ++char_offset) return {0, 2};
         has_colon = true;
       }
     }
 
     // the upper bound is 18:00:00 (regardless of sign)
-    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59)
-      return {0, 2};
-    if (hms[0] == 18 && hms[1] + hms[2] > 0)
-      return {0, 2};
+    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2};
+    if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2};
 
     return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0};
   }
@@ -346,14 +322,15 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
   /**
    * TODO: replace linear search with more efficient approach (like prefix tree)
    */
-  __device__ inline int
-  query_index_from_tz_db(string_view const &tz_lit) const {
+  __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const
+  {
     auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) {
       return tz->element<string_view>(i) == tz_lit;
     };
-    auto ret = thrust::find_if(
-        thrust::seq, thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(tz_indices->size()), predicate);
+    auto ret = thrust::find_if(thrust::seq,
+                               thrust::make_counting_iterator(0),
+                               thrust::make_counting_iterator(tz_indices->size()),
+                               predicate);
 
     return *ret;
   }
@@ -364,23 +341,21 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
    * `convert_timestamp_tz_functor`.
    */
   __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second,
-                                                    size_type tz_index) const {
-    auto const &utc_offsets = transitions->child().child(2);
-    auto const &loose_instants = transitions->child().child(3);
+                                                    size_type tz_index) const
+  {
+    auto const& utc_offsets    = transitions->child().child(2);
+    auto const& loose_instants = transitions->child().child(3);
 
-    auto const local_transitions =
-        cudf::list_device_view{*transitions, tz_index};
-    auto const list_size = local_transitions.size();
+    auto const local_transitions = cudf::list_device_view{*transitions, tz_index};
+    auto const list_size         = local_transitions.size();
 
     auto const transition_times = cudf::device_span<int64_t const>(
-        loose_instants.data<int64_t>() + local_transitions.element_offset(0),
-        static_cast<size_t>(list_size));
-
-    auto const it =
-        thrust::upper_bound(thrust::seq, transition_times.begin(),
-                            transition_times.end(), loose_epoch_second);
-    auto const idx =
-        static_cast<size_type>(thrust::distance(transition_times.begin(), it));
+      loose_instants.data<int64_t>() + local_transitions.element_offset(0),
+      static_cast<size_t>(list_size));
+
+    auto const it = thrust::upper_bound(
+      thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
+    auto const idx         = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
     auto const list_offset = local_transitions.element_offset(idx - 1);
 
     return static_cast<int64_t>(utc_offsets.element<int32_t>(list_offset));
@@ -395,26 +370,24 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
    * the TimezoneDB(Java side). Then, we can apply binary search based on loose
    * epoch instants of TimezoneDB to find out the correct timezone offset.
    */
-  __device__ inline int64_t
-  compute_loose_epoch_s(timestamp_components const &ts) const {
-    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L +
-           ts.hour * 3600L + ts.minute * 60L + ts.second;
+  __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
+  {
+    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L +
+           ts.minute * 60L + ts.second;
   }
 
   /**
    * Leverage STL to convert local time to UTC unix_timestamp(in millisecond)
    */
-  __device__ inline int64_t
-  compute_epoch_us(timestamp_components const &ts) const {
-    auto const ymd = // chrono class handles the leap year calculations for us
-        cuda::std::chrono::year_month_day(
-            cuda::std::chrono::year{ts.year},
-            cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
-            cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
+  __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
+  {
+    auto const ymd =  // chrono class handles the leap year calculations for us
+      cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year},
+                                        cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
+                                        cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
     auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
 
-    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) +
-                          (ts.minute * 60L) + ts.second;
+    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
 
     return timestamp_s * 1000000L + ts.microseconds;
   }
@@ -427,18 +400,16 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
    * Parse a string with time zone to a timestamp.
    * The bool in the returned tuple is false if the parse failed.
    */
-  __device__ inline ParseResult
-  parse_string_to_timestamp_us(timestamp_components *ts_comp,
-                               char const **parsed_tz_ptr,
-                               size_type *parsed_tz_length,
-                               cudf::string_view const &timestamp_str) const {
-
-    if (timestamp_str.empty()) {
-      return ParseResult::INVALID;
-    }
+  __device__ inline ParseResult parse_string_to_timestamp_us(
+    timestamp_components* ts_comp,
+    char const** parsed_tz_ptr,
+    size_type* parsed_tz_length,
+    cudf::string_view const& timestamp_str) const
+  {
+    if (timestamp_str.empty()) { return ParseResult::INVALID; }
 
-    const char *curr_ptr = timestamp_str.data();
-    const char *end_ptr = curr_ptr + timestamp_str.size_bytes();
+    const char* curr_ptr = timestamp_str.data();
+    const char* end_ptr  = curr_ptr + timestamp_str.size_bytes();
 
     // trim left
     while (curr_ptr < end_ptr && is_whitespace(*curr_ptr)) {
@@ -451,29 +422,26 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
 
     // TODO: support special dates [epoch, now, today, yesterday, tomorrow]
     for (size_type i = 0; i < special_datetime_names.size(); i++) {
-      auto const &ref = special_datetime_names.element<string_view>(i);
-      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(),
-                                   ref.data() + ref.size_bytes())) {
-        *parsed_tz_ptr = ref.data();
+      auto const& ref = special_datetime_names.element<string_view>(i);
+      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) {
+        *parsed_tz_ptr    = ref.data();
         *parsed_tz_length = ref.size_bytes();
         return ParseResult::UNSUPPORTED;
       }
     }
 
-    if (curr_ptr == end_ptr) {
-      return ParseResult::INVALID;
-    }
+    if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
 
-    const char *const bytes = curr_ptr;
+    const char* const bytes      = curr_ptr;
     const size_type bytes_length = end_ptr - curr_ptr;
 
-    int segments[] = {1, 1, 1, 0, 0, 0, 0, 0, 0};
-    int segments_len = 9;
-    int i = 0;
-    int current_segment_value = 0;
+    int segments[]             = {1, 1, 1, 0, 0, 0, 0, 0, 0};
+    int segments_len           = 9;
+    int i                      = 0;
+    int current_segment_value  = 0;
     int current_segment_digits = 0;
-    size_t j = 0;
-    int digits_milli = 0;
+    size_t j                   = 0;
+    int digits_milli           = 0;
     // bool just_time = false;
     thrust::optional<int> year_sign;
     if ('-' == bytes[j] || '+' == bytes[j]) {
@@ -486,7 +454,7 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
     }
 
     while (j < bytes_length) {
-      char b = bytes[j];
+      char b           = bytes[j];
       int parsed_value = static_cast<int32_t>(b - '0');
       if (parsed_value < 0 || parsed_value > 9) {
         if (0 == j && 'T' == b) {
@@ -494,32 +462,26 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
           i += 3;
         } else if (i < 2) {
           if (b == '-') {
-            if (!is_valid_digits(i, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else if (0 == i && ':' == b && !year_sign.has_value()) {
             // just_time = true;
-            if (!is_valid_digits(3, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[3] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[3]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
-            i = 4;
+            i                      = 4;
           } else {
             return ParseResult::INVALID;
           }
         } else if (2 == i) {
           if (' ' == b || 'T' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
@@ -527,11 +489,9 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
           }
         } else if (3 == i || 4 == i) {
           if (':' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
@@ -539,20 +499,17 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
           }
         } else if (5 == i || 6 == i) {
           if ('.' == b && 5 == i) {
-            if (!is_valid_digits(i, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
-            if (!is_valid_digits(i, current_segment_digits) ||
-                !allow_tz_in_date_str) {
+            if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) {
               return ParseResult::INVALID;
             }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
             *parsed_tz_ptr = bytes + j;
@@ -562,16 +519,12 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
             *parsed_tz_length = end_ptr - *parsed_tz_ptr;
             break;
           }
-          if (i == 6 && '.' != b) {
-            i += 1;
-          }
+          if (i == 6 && '.' != b) { i += 1; }
         } else {
           if (i < segments_len && (':' == b || ' ' == b)) {
-            if (!is_valid_digits(i, current_segment_digits)) {
-              return ParseResult::INVALID;
-            }
-            segments[i] = current_segment_value;
-            current_segment_value = 0;
+            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            segments[i]            = current_segment_value;
+            current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
@@ -579,9 +532,7 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
           }
         }
       } else {
-        if (6 == i) {
-          digits_milli += 1;
-        }
+        if (6 == i) { digits_milli += 1; }
         // We will truncate the nanosecond part if there are more than 6 digits,
         // which results in loss of precision
         if (6 != i || current_segment_digits < 6) {
@@ -592,9 +543,7 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
       j += 1;
     }
 
-    if (!is_valid_digits(i, current_segment_digits)) {
-      return ParseResult::INVALID;
-    }
+    if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
     segments[i] = current_segment_value;
 
     while (digits_milli < 6) {
@@ -606,12 +555,12 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
     // above is ported from Spark.
 
     // set components
-    ts_comp->year = segments[0];
-    ts_comp->month = static_cast<int8_t>(segments[1]);
-    ts_comp->day = static_cast<int8_t>(segments[2]);
-    ts_comp->hour = static_cast<int8_t>(segments[3]);
-    ts_comp->minute = static_cast<int8_t>(segments[4]);
-    ts_comp->second = static_cast<int8_t>(segments[5]);
+    ts_comp->year         = segments[0];
+    ts_comp->month        = static_cast<int8_t>(segments[1]);
+    ts_comp->day          = static_cast<int8_t>(segments[2]);
+    ts_comp->hour         = static_cast<int8_t>(segments[3]);
+    ts_comp->minute       = static_cast<int8_t>(segments[4]);
+    ts_comp->second       = static_cast<int8_t>(segments[5]);
     ts_comp->microseconds = segments[6];
 
     return ParseResult::OK;
@@ -626,85 +575,86 @@ template <bool with_timezone> struct parse_timestamp_string_fn {
  * without_timezone ones by checking if transitions and tz_indices are nullptr.
  *
  */
-std::unique_ptr<cudf::column>
-to_timestamp(cudf::strings_column_view const &input,
-             cudf::strings_column_view const &special_datetime_lit,
-             bool ansi_mode, bool allow_tz_in_date_str = true,
-             size_type default_tz_index = 1000000000,
-             cudf::column_view const *transitions = nullptr,
-             cudf::strings_column_view const *tz_indices = nullptr) {
+std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
+                                           cudf::strings_column_view const& special_datetime_lit,
+                                           bool ansi_mode,
+                                           bool allow_tz_in_date_str                   = true,
+                                           size_type default_tz_index                  = 1000000000,
+                                           cudf::column_view const* transitions        = nullptr,
+                                           cudf::strings_column_view const* tz_indices = nullptr)
+{
   auto const stream = cudf::get_default_stream();
-  auto const mr = rmm::mr::get_current_device_resource();
+  auto const mr     = rmm::mr::get_current_device_resource();
 
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
   auto d_special_datetime_lit =
-      cudf::column_device_view::create(special_datetime_lit.parent(), stream);
+    cudf::column_device_view::create(special_datetime_lit.parent(), stream);
 
   // column to store the result timestamp
-  auto result_col = cudf::make_timestamp_column(
-      cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}, input.size(),
-      cudf::mask_state::UNALLOCATED, stream, mr);
+  auto result_col =
+    cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
+                                input.size(),
+                                cudf::mask_state::UNALLOCATED,
+                                stream,
+                                mr);
   // column to store the status `ParseResult`
   auto result_valid_col = cudf::make_fixed_width_column(
-      cudf::data_type{cudf::type_id::UINT8}, input.size(),
-      cudf::mask_state::UNALLOCATED, stream, mr);
+    cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr);
 
   if (transitions == nullptr || tz_indices == nullptr) {
     thrust::transform(
-        rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(input.size()),
-        thrust::make_zip_iterator(thrust::make_tuple(
-            result_col->mutable_view().begin<cudf::timestamp_us>(),
-            result_valid_col->mutable_view().begin<uint8_t>())),
-        parse_timestamp_string_fn<false>{*d_strings, *d_special_datetime_lit,
-                                         default_tz_index,
-                                         allow_tz_in_date_str});
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(input.size()),
+      thrust::make_zip_iterator(
+        thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
+                           result_valid_col->mutable_view().begin<uint8_t>())),
+      parse_timestamp_string_fn<false>{
+        *d_strings, *d_special_datetime_lit, default_tz_index, allow_tz_in_date_str});
   } else {
-    auto const ft_cdv_ptr = column_device_view::create(*transitions, stream);
+    auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
-    auto d_tz_indices =
-        cudf::column_device_view::create(tz_indices->parent(), stream);
+    auto d_tz_indices        = cudf::column_device_view::create(tz_indices->parent(), stream);
 
     thrust::transform(
-        rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(input.size()),
-        thrust::make_zip_iterator(thrust::make_tuple(
-            result_col->mutable_view().begin<cudf::timestamp_us>(),
-            result_valid_col->mutable_view().begin<uint8_t>())),
-        parse_timestamp_string_fn<true>{*d_strings, *d_special_datetime_lit,
-                                        default_tz_index, true, d_transitions,
-                                        *d_tz_indices});
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(input.size()),
+      thrust::make_zip_iterator(
+        thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
+                           result_valid_col->mutable_view().begin<uint8_t>())),
+      parse_timestamp_string_fn<true>{
+        *d_strings, *d_special_datetime_lit, default_tz_index, true, d_transitions, *d_tz_indices});
   }
 
   auto valid_view = result_valid_col->mutable_view();
 
   // throw cuDF exception if there exists any unsupported formats
   auto exception_exists =
-      thrust::any_of(rmm::exec_policy(stream), valid_view.begin<uint8_t>(),
-                     valid_view.end<uint8_t>(), [] __device__(uint8_t e) {
-                       return e == ParseResult::UNSUPPORTED;
-                     });
-  if (exception_exists) {
-    CUDF_FAIL("There exists unsupported timestamp schema!");
-  }
+    thrust::any_of(rmm::exec_policy(stream),
+                   valid_view.begin<uint8_t>(),
+                   valid_view.end<uint8_t>(),
+                   [] __device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; });
+  if (exception_exists) { CUDF_FAIL("There exists unsupported timestamp schema!"); }
 
   // build the updated nullmask and compute the null count
   auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if(
-      valid_view.begin<uint8_t>(), valid_view.end<uint8_t>(),
-      [] __device__(uint8_t e) { return e == 0; }, stream, mr);
+    valid_view.begin<uint8_t>(),
+    valid_view.end<uint8_t>(),
+    [] __device__(uint8_t e) { return e == 0; },
+    stream,
+    mr);
 
   // `output null count > input null count` indicates that there are new null
   // values generated during the `to_timestamp` transaction to replace invalid
   // inputs.
-  if (ansi_mode && input.null_count() < valid_null_count) {
-    return nullptr;
-  }
+  if (ansi_mode && input.null_count() < valid_null_count) { return nullptr; }
 
   result_col->set_null_mask(valid_bitmask, valid_null_count, stream);
   return std::move(result_col);
 }
 
-} // namespace
+}  // namespace
 
 namespace spark_rapids_jni {
 
@@ -714,16 +664,16 @@ namespace spark_rapids_jni {
  * If does not have time zone in string, use the default time zone.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
-    cudf::strings_column_view const &input,
-    cudf::column_view const &transitions,
-    cudf::strings_column_view const &tz_indices,
-    cudf::strings_column_view const &special_datetime_lit,
-    cudf::size_type default_tz_index, bool ansi_mode) {
-  if (input.size() == 0) {
-    return nullptr;
-  }
-  return to_timestamp(input, special_datetime_lit, ansi_mode, true,
-                      default_tz_index, &transitions, &tz_indices);
+  cudf::strings_column_view const& input,
+  cudf::column_view const& transitions,
+  cudf::strings_column_view const& tz_indices,
+  cudf::strings_column_view const& special_datetime_lit,
+  cudf::size_type default_tz_index,
+  bool ansi_mode)
+{
+  if (input.size() == 0) { return nullptr; }
+  return to_timestamp(
+    input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
 }
 
 /**
@@ -734,13 +684,13 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  * invalid.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
-    cudf::strings_column_view const &input,
-    cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone,
-    bool ansi_mode) {
-  if (input.size() == 0) {
-    return nullptr;
-  }
+  cudf::strings_column_view const& input,
+  cudf::strings_column_view const& special_datetime_lit,
+  bool allow_time_zone,
+  bool ansi_mode)
+{
+  if (input.size() == 0) { return nullptr; }
   return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone);
 }
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 9c536b6837..bcdfe55ebf 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -78,11 +78,12 @@ namespace spark_rapids_jni {
  * if there exists invalid inputs and ANSI mode is on.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
-    cudf::strings_column_view const &input,
-    cudf::column_view const &transitions,
-    cudf::strings_column_view const &tz_indices,
-    cudf::strings_column_view const &special_datetime_lit,
-    cudf::size_type default_tz_index, bool ansi_mode);
+  cudf::strings_column_view const& input,
+  cudf::column_view const& transitions,
+  cudf::strings_column_view const& tz_indices,
+  cudf::strings_column_view const& special_datetime_lit,
+  cudf::size_type default_tz_index,
+  bool ansi_mode);
 
 /**
  *
@@ -140,8 +141,9 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  * if there exists invalid inputs and ANSI mode is on.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
-    cudf::strings_column_view const &input,
-    cudf::strings_column_view const &special_datetime_lit, bool allow_time_zone,
-    bool ansi_mode);
+  cudf::strings_column_view const& input,
+  cudf::strings_column_view const& special_datetime_lit,
+  bool allow_time_zone,
+  bool ansi_mode);
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni

From 2cf49405eb2dc20d0803736eab4518abafa27a25 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 11 Jan 2024 13:40:07 +0800
Subject: [PATCH 12/35] Copyrights;typos

---
 src/main/cpp/CMakeLists.txt                   |   2 +-
 src/main/cpp/src/CastStringJni.cpp            |   2 +-
 src/main/cpp/src/datetime_parser.cu           |  22 +-
 src/main/cpp/src/datetime_parser.hpp          |   6 +-
 src/main/cpp/tests/CMakeLists.txt             |   3 -
 src/main/cpp/tests/datetime_parser.cpp        | 188 ------------------
 .../nvidia/spark/rapids/jni/CastStrings.java  |   2 +-
 .../spark/rapids/jni/GpuTimeZoneDB.java       |   2 +-
 .../spark/rapids/jni/CastStringsTest.java     |   2 +-
 9 files changed, 19 insertions(+), 210 deletions(-)
 delete mode 100644 src/main/cpp/tests/datetime_parser.cpp

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 4eabade61b..d83253747b 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index ee0a053b88..6d3fb6b405 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 7e2a73d959..25eef7291c 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -126,7 +126,7 @@ __device__ __host__ bool is_valid_digits(int segment, int digits)
 }
 
 /**
- * We have to dintinguish INVALID value with UNSUPPORTED value.
+ * We have to distinguish INVALID value with UNSUPPORTED value.
  * INVALID means the value is invalid in Spark SQL.
  * UNSUPPORTED means the value is valid in Spark SQL but not supported by rapids
  * yet. As for INVALID values, we treat them in the same as Spark SQL. As for
@@ -149,7 +149,7 @@ struct parse_timestamp_string_fn {
 
   __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
   {
-    // inherit the nullmask of the input column
+    // inherit the null mask of the input column
     if (!d_strings.is_valid(idx)) {
       return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
     }
@@ -221,7 +221,7 @@ struct parse_timestamp_string_fn {
    * This function is purposed to be fully align to Apache Spark's behavior. The
    * function returns the status along with the result: 0 - successfully parsed
    * the timezone offset 1 - not a valid UTC-like timezone representation, maybe
-   * valid regioned-base rep 2 - not a valid timezone representation
+   * valid region-based rep 2 - not a valid timezone representation
    *
    * Valid patterns:
    *   with colon
@@ -267,7 +267,7 @@ struct parse_timestamp_string_fn {
     } else if (sign_char == '-') {
       sign = -1L;
     } else {
-      // if the rep starts with UTC|GMT, it can NOT be regioned-base rep
+      // if the rep starts with UTC|GMT, it can NOT be region-based rep
       return {0, char_offset < 3 ? 1 : 2};
     }
 
@@ -291,7 +291,7 @@ struct parse_timestamp_string_fn {
       // deal with `:`
       if (*(ptr + char_offset) == ':') {
         // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3)
-        // 2. (i == 2) one_dight ss is invalid (+11:22:3)
+        // 2. (i == 2) one_digit ss is invalid (+11:22:3)
         // 3. trailing `:` is invalid (GMT+8:)
         if (i > 0 || len == ++char_offset) return {0, 2};
         has_colon = true;
@@ -336,7 +336,7 @@ struct parse_timestamp_string_fn {
   }
 
   /**
-   * Perform binaryserach to search out the timezone offset based on loose epoch
+   * Perform binary search to search out the timezone offset based on loose epoch
    * instants. Basically, this is the same approach as
    * `convert_timestamp_tz_functor`.
    */
@@ -571,7 +571,7 @@ struct parse_timestamp_string_fn {
  * The common entrance of string_to_timestamp, which combines two paths:
  * with_timezone and without_timezone. This function returns the The
  * transitions, tz_indices and default_tz_index are only for handling inputs
- * with timezone. So, this function distinguish with_timezone callsfrom
+ * with timezone. So, this function distinguish with_timezone calls from
  * without_timezone ones by checking if transitions and tz_indices are nullptr.
  *
  */
@@ -637,7 +637,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
                    [] __device__(uint8_t e) { return e == ParseResult::UNSUPPORTED; });
   if (exception_exists) { CUDF_FAIL("There exists unsupported timestamp schema!"); }
 
-  // build the updated nullmask and compute the null count
+  // build the updated null mask and compute the null count
   auto [valid_bitmask, valid_null_count] = cudf::detail::valid_if(
     valid_view.begin<uint8_t>(),
     valid_view.end<uint8_t>(),
@@ -660,7 +660,7 @@ namespace spark_rapids_jni {
 
 /**
  * Parse string column with time zone to timestamp column,
- * Returns a pair of timestamp column and a bool indicates whether successed.
+ * Returns a pair of timestamp column and a bool indicates whether it successes.
  * If does not have time zone in string, use the default time zone.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
@@ -678,7 +678,7 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
 
 /**
  * Parse string column with time zone to timestamp column,
- * Returns a pair of timestamp column and a bool indicates whether successed.
+ * Returns a pair of timestamp column and a bool indicates whether it successes.
  * Do not use the time zone in string.
  * If allow_time_zone is false and string contains time zone, then the string is
  * invalid.
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index bcdfe55ebf..b1753af6a2 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ namespace spark_rapids_jni {
  * @param transitions TimezoneDB, the table of transitions contains all
  * information for timezones
  * @param tz_indices TimezoneDB index of region-based timezone IDs
- * @param special_datetime_lit cache of special datetimes
+ * @param special_datetime_lit cache of special date times
  * @param default_tz_index the index of default timezone in TimezoneDB, if input
  * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use
  * this time zone.
@@ -132,7 +132,7 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  *
  *
  * @param input input string column view.
- * @param special_datetime_lit cache of special datetimes
+ * @param special_datetime_lit cache of special date times
  * @param allow_time_zone whether allow time zone in the timestamp string. e.g.:
  *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
  * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be
diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt
index 1f58176327..617df6dfde 100644
--- a/src/main/cpp/tests/CMakeLists.txt
+++ b/src/main/cpp/tests/CMakeLists.txt
@@ -57,9 +57,6 @@ ConfigureTest(FORMAT_FLOAT
 ConfigureTest(CAST_FLOAT_TO_STRING
     cast_float_to_string.cpp)
 
-ConfigureTest(DATETIME_PARSER
-    datetime_parser.cpp)
-
 ConfigureTest(DATETIME_REBASE
     datetime_rebase.cpp)
 
diff --git a/src/main/cpp/tests/datetime_parser.cpp b/src/main/cpp/tests/datetime_parser.cpp
deleted file mode 100644
index 9ab4271327..0000000000
--- a/src/main/cpp/tests/datetime_parser.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cassert>
-#include <cstring>
-
-#include <datetime_parser.hpp>
-
-//
-
-#include <cudf/strings/convert/convert_datetime.hpp>
-#include <cudf/strings/convert/convert_durations.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/wrappers/durations.hpp>
-#include <cudf/wrappers/timestamps.hpp>
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-
-using timestamp_col =
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
-using micros_col =
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep>;
-struct DateTimeParserTest : public cudf::test::BaseFixture {};
-
-TEST_F(DateTimeParserTest, ParseTimestamp)
-{
-  auto ts_strings = cudf::test::strings_column_wrapper(
-    {
-      "2023",
-      " 2023 ",
-      " 2023-11 ",
-      " 2023-11-5 ",
-      " 2023-11-05 3:04:55   ",
-      " 2023-11-05T03:4:55   ",
-      " 2023-11-05T3:4:55   ",
-      "  2023-11-5T3:4:55.",
-      "  2023-11-5T3:4:55.Iran",
-      "  2023-11-5T3:4:55.1 ",
-      "  2023-11-5T3:4:55.1Iran",
-      "  2023-11-05T03:04:55.123456  ",
-      "  2023-11-05T03:04:55.123456Iran  ",
-      " 222222 ",
-      " ",  // invalid
-      "",   // invalid
-      "1-"  // invalid
-
-    },
-    {
-
-      0,  // null bit
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1
-
-    });
-  auto d_2023_1_1           = (2023L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L;
-  auto d_2023_11_1          = (2023L * 365L * 86400L + 11 * 30L * 86400L + 1 * 86400L) * 1000000L;
-  auto d_2023_11_5          = (2023L * 365L * 86400L + 11L * 30L * 86400L + 5L * 86400L) * 1000000L;
-  auto t_3_4_55             = (3L * 3600L + 4L * 60L + 55L) * 1000000L;
-  auto d_2023_11_5_t_3_4_55 = d_2023_11_5 + t_3_4_55;
-  auto ts_col               = timestamp_col(
-    {
-
-      0L,
-      d_2023_1_1,
-      d_2023_11_1,
-      d_2023_11_5,
-      d_2023_11_5_t_3_4_55,
-      d_2023_11_5_t_3_4_55,
-      d_2023_11_5_t_3_4_55,
-      d_2023_11_5_t_3_4_55,
-      d_2023_11_5_t_3_4_55,
-      d_2023_11_5_t_3_4_55 + 100000,
-      d_2023_11_5_t_3_4_55 + 100000,
-      d_2023_11_5_t_3_4_55 + 123456,
-      d_2023_11_5_t_3_4_55 + 123456,
-      (222222L * 365L * 86400L + 1 * 30L * 86400L + 1 * 86400L) * 1000000L,
-      0L,
-      0L,
-      0L
-
-    },
-    {
-      0,  // null bit
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      1,
-      0,  // null bit
-      0,  // null bit
-      0   // null bit
-
-    });
-  auto ret =
-    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, false);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first));
-  assert(ret.second == true);
-
-  ts_strings = cudf::test::strings_column_wrapper(
-    {
-
-      "invalid"
-
-    },
-    {
-
-      1
-
-    });
-  ts_col = timestamp_col(
-    {
-
-      0L
-
-    },
-    {0
-
-    });
-  ret =
-    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true);
-  assert(ret.first == nullptr);
-  assert(ret.second == false);
-
-  ts_strings = cudf::test::strings_column_wrapper(
-    {
-
-      " Epoch  ", " NOW ", "  today  ", "  tomoRRow  ", "  yesTERday  "
-
-    },
-    {
-
-      1, 1, 1, 1, 1
-
-    });
-  ts_col = timestamp_col(
-    {// Temp implement: epoch -> 111, now -> 222, ... , yesterday -> 555
-     111L,
-     222L,
-     333L,
-     444L,
-     555L
-
-    },
-    {1, 1, 1, 1, 1
-
-    });
-  ret =
-    spark_rapids_jni::string_to_timestamp(cudf::strings_column_view(ts_strings), "Z", true, true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(ts_col, *(ret.first));
-  assert(ret.second == true);
-}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index b383468e7e..eb5c09b062 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index cc9ba04a8e..cccf831081 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2023, NVIDIA CORPORATION.
+* Copyright (c) 2023-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index a8939bc825..e5384e4be7 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 8eff534ec7bc76497a46a11e0e8759ca3f45942a Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 11 Jan 2024 17:25:03 +0800
Subject: [PATCH 13/35] Fix compile error; Update comments

---
 src/main/cpp/src/datetime_parser.cu  |  2 +-
 src/main/cpp/src/datetime_parser.hpp | 11 ++---------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 25eef7291c..8e9c503722 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -651,7 +651,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
   if (ansi_mode && input.null_count() < valid_null_count) { return nullptr; }
 
   result_col->set_null_mask(valid_bitmask, valid_null_count, stream);
-  return std::move(result_col);
+  return result_col;
 }
 
 }  // namespace
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index b1753af6a2..7ee05b84ec 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -58,11 +58,7 @@ namespace spark_rapids_jni {
  *     - +|-hhmmss
  *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
  *
- * Unlike Spark, Spark-Rapids only supports the following time zones:
- *   - Z - Zulu time zone UTC+0
- *   - +|-[h]h:[m]m
- *   - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
- *
+ * Unlike Spark, Spark-Rapids currently does not support DST time zones.
  *
  * @param input input string column view.
  * @param transitions TimezoneDB, the table of transitions contains all
@@ -125,10 +121,7 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  *     - +|-hhmmss
  *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
  *
- * Unlike Spark, Spark-Rapids only supports the following time zones:
- *   - Z - Zulu time zone UTC+0
- *   - +|-[h]h:[m]m
- *   - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+ * Unlike Spark, Spark-Rapids currently does not support DST time zones.
  *
  *
  * @param input input string column view.

From 5dbc7ebc785026032167b030b249b68526f36f2e Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 12 Jan 2024 18:30:36 +0800
Subject: [PATCH 14/35] Remove support for casting special strings (epoch, now,
 today, yesterday, tomorrow) to timestamp because only Spark31x supports them
 and Spark320+ does not

---
 src/main/cpp/src/CastStringJni.cpp            | 11 +----
 src/main/cpp/src/datetime_parser.cu           | 47 ++-----------------
 src/main/cpp/src/datetime_parser.hpp          |  4 --
 .../nvidia/spark/rapids/jni/CastStrings.java  | 17 +++----
 .../spark/rapids/jni/GpuTimeZoneDB.java       | 23 ++-------
 .../spark/rapids/jni/CastStringsTest.java     |  8 +++-
 6 files changed, 21 insertions(+), 89 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 6d3fb6b405..82a3dc3242 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -263,7 +263,6 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
                                                          jlong input_column,
                                                          jlong transitions_handle,
                                                          jlong tz_indices_col,
-                                                         jlong special_dt_lit_col,
                                                          jint tz_default_index,
                                                          jboolean ansi_enabled)
 {
@@ -277,13 +276,10 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
       reinterpret_cast<cudf::table_view const*>(transitions_handle)->column(0);
     auto const& tz_indices_view =
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(tz_indices_col));
-    auto const& special_dt_lit_view =
-      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(special_dt_lit_col));
-
     auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
 
     auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz(
-      input_view, transitions, tz_indices_view, special_dt_lit_view, tz_index, ansi_enabled);
+      input_view, transitions, tz_indices_view, tz_index, ansi_enabled);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
@@ -298,7 +294,6 @@ JNIEXPORT jlong JNICALL
 Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env,
                                                                         jclass,
                                                                         jlong input_column,
-                                                                        jlong special_dt_lit_col,
                                                                         jboolean allow_time_zone,
                                                                         jboolean ansi_enabled)
 {
@@ -307,11 +302,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv*
     cudf::jni::auto_set_device(env);
     auto const& input_view =
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(input_column));
-    auto const& special_dt_lit_view =
-      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(special_dt_lit_col));
 
     auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz(
-      input_view, special_dt_lit_view, allow_time_zone, ansi_enabled);
+      input_view, allow_time_zone, ansi_enabled);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 8e9c503722..3ba84f2d26 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -87,28 +87,6 @@ __device__ __host__ inline bool is_whitespace(const char chr)
   }
 }
 
-/**
- * Whether the given two strings are equal,
- * used to compare special timestamp strings ignoring case:
- *   "epoch", "now", "today", "yesterday", "tomorrow"
- * the expect string should be lower-case a-z chars
- */
-__device__ inline bool equals_ascii_ignore_case(char const* actual_begin,
-                                                char const* actual_end,
-                                                char const* expect_begin,
-                                                char const* expect_end)
-{
-  if (actual_end - actual_begin != expect_end - expect_begin) { return false; }
-
-  while (expect_begin < expect_end) {
-    // the diff between upper case and lower case for a same char is 32
-    if (*actual_begin != *expect_begin && *actual_begin != (*expect_begin - 32)) { return false; }
-    actual_begin++;
-    expect_begin++;
-  }
-  return true;
-}
-
 /**
  * Ported from Spark
  */
@@ -137,7 +115,6 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 };
 template <bool with_timezone>
 struct parse_timestamp_string_fn {
   column_device_view const d_strings;
-  column_device_view const special_datetime_names;
   size_type default_tz_index;
   bool allow_tz_in_date_str = true;
   // The list column of transitions to figure out the correct offset
@@ -420,16 +397,6 @@ struct parse_timestamp_string_fn {
       --end_ptr;
     }
 
-    // TODO: support special dates [epoch, now, today, yesterday, tomorrow]
-    for (size_type i = 0; i < special_datetime_names.size(); i++) {
-      auto const& ref = special_datetime_names.element<string_view>(i);
-      if (equals_ascii_ignore_case(curr_ptr, end_ptr, ref.data(), ref.data() + ref.size_bytes())) {
-        *parsed_tz_ptr    = ref.data();
-        *parsed_tz_length = ref.size_bytes();
-        return ParseResult::UNSUPPORTED;
-      }
-    }
-
     if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
 
     const char* const bytes      = curr_ptr;
@@ -576,7 +543,6 @@ struct parse_timestamp_string_fn {
  *
  */
 std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
-                                           cudf::strings_column_view const& special_datetime_lit,
                                            bool ansi_mode,
                                            bool allow_tz_in_date_str                   = true,
                                            size_type default_tz_index                  = 1000000000,
@@ -587,9 +553,6 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
   auto const mr     = rmm::mr::get_current_device_resource();
 
   auto d_strings = cudf::column_device_view::create(input.parent(), stream);
-  auto d_special_datetime_lit =
-    cudf::column_device_view::create(special_datetime_lit.parent(), stream);
-
   // column to store the result timestamp
   auto result_col =
     cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
@@ -610,7 +573,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
         thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
                            result_valid_col->mutable_view().begin<uint8_t>())),
       parse_timestamp_string_fn<false>{
-        *d_strings, *d_special_datetime_lit, default_tz_index, allow_tz_in_date_str});
+        *d_strings, default_tz_index, allow_tz_in_date_str});
   } else {
     auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
@@ -624,7 +587,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
         thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
                            result_valid_col->mutable_view().begin<uint8_t>())),
       parse_timestamp_string_fn<true>{
-        *d_strings, *d_special_datetime_lit, default_tz_index, true, d_transitions, *d_tz_indices});
+        *d_strings, default_tz_index, true, d_transitions, *d_tz_indices});
   }
 
   auto valid_view = result_valid_col->mutable_view();
@@ -667,13 +630,12 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::strings_column_view const& input,
   cudf::column_view const& transitions,
   cudf::strings_column_view const& tz_indices,
-  cudf::strings_column_view const& special_datetime_lit,
   cudf::size_type default_tz_index,
   bool ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(
-    input, special_datetime_lit, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
+    input, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
 }
 
 /**
@@ -685,12 +647,11 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
   cudf::strings_column_view const& input,
-  cudf::strings_column_view const& special_datetime_lit,
   bool allow_time_zone,
   bool ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
-  return to_timestamp(input, special_datetime_lit, ansi_mode, allow_time_zone);
+  return to_timestamp(input, ansi_mode, allow_time_zone);
 }
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 7ee05b84ec..c7f9d5ec65 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -64,7 +64,6 @@ namespace spark_rapids_jni {
  * @param transitions TimezoneDB, the table of transitions contains all
  * information for timezones
  * @param tz_indices TimezoneDB index of region-based timezone IDs
- * @param special_datetime_lit cache of special date times
  * @param default_tz_index the index of default timezone in TimezoneDB, if input
  * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use
  * this time zone.
@@ -77,7 +76,6 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::strings_column_view const& input,
   cudf::column_view const& transitions,
   cudf::strings_column_view const& tz_indices,
-  cudf::strings_column_view const& special_datetime_lit,
   cudf::size_type default_tz_index,
   bool ansi_mode);
 
@@ -125,7 +123,6 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  *
  *
  * @param input input string column view.
- * @param special_datetime_lit cache of special date times
  * @param allow_time_zone whether allow time zone in the timestamp string. e.g.:
  *   1991-04-14T02:00:00Asia/Shanghai is invalid when do not allow time zone.
  * @param ansi_mode whether enforce ANSI mode or not. If true, exception will be
@@ -135,7 +132,6 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
   cudf::strings_column_view const& input,
-  cudf::strings_column_view const& special_datetime_lit,
   bool allow_time_zone,
   bool ansi_mode);
 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index eb5c09b062..c515269c27 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -200,10 +200,9 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
     Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString());
 
     try (Table transitions = singleton.getTransitions();
-         ColumnVector tzIndices = singleton.getZoneIDVector();
-         ColumnVector specialTz = singleton.getSpecialTzVector()) {
+         ColumnVector tzIndices = singleton.getZoneIDVector()) {
       return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(),
-              tzIndices.getNativeView(), specialTz.getNativeView(), tzIndex, ansiEnabled));
+              tzIndices.getNativeView(), tzIndex, ansiEnabled));
     }
   }
 
@@ -246,11 +245,7 @@ public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean all
     if (!singleton.isLoaded()) {
       GpuTimeZoneDB.cacheDatabase();
     }
-
-    try (ColumnVector specialTz = singleton.getSpecialTzVector()) {
-      return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), specialTz.getNativeView(),
-              allowTimeZone,  ansiEnabled));
-    }
+    return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone,  ansiEnabled));
   }
 
   private static native long toInteger(long nativeColumnView, boolean ansi_enabled, boolean strip,
@@ -265,7 +260,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);
   private static native long toTimestamp(long input,
-      long transitions, long tzIndices, long specialDate, int tzIndex, boolean ansiEnabled);
-  private static native long toTimestampWithoutTimeZone(long input,
-      long specialDate, boolean allowTimeZone, boolean ansiEnabled);
+      long transitions, long tzIndices, int tzIndex, boolean ansiEnabled);
+  private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone,
+      boolean ansiEnabled);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index cccf831081..6b09d2dda0 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -35,8 +35,6 @@
 public class GpuTimeZoneDB {
 
   public static final int TIMEOUT_SECS = 300;
-  public static final String[] SPECIAL_TZ_LITERALS = {"epoch", "now", "today", "tomorrow", "yesterday"};
-
 
   // For the timezone database, we store the transitions in a ColumnVector that is a list of 
   // structs. The type of this column vector is:
@@ -44,7 +42,6 @@ public class GpuTimeZoneDB {
   private CompletableFuture<Map<String, Integer>> zoneIdToTableFuture;
   private CompletableFuture<HostColumnVector> fixedTransitionsFuture;
   private CompletableFuture<HostColumnVector> zoneIdVectorFuture;
-  private CompletableFuture<HostColumnVector> specialTzLiteralsFuture;
 
   private boolean closed = false;
 
@@ -52,7 +49,6 @@ public class GpuTimeZoneDB {
     zoneIdToTableFuture = new CompletableFuture<>();
     fixedTransitionsFuture = new CompletableFuture<>();
     zoneIdVectorFuture = new CompletableFuture<>();
-    specialTzLiteralsFuture = new CompletableFuture<>();
   }
 
   private static GpuTimeZoneDB instance = new GpuTimeZoneDB();
@@ -163,7 +159,7 @@ public static ZoneId getZoneId(String timeZoneId) {
 
   public boolean isLoaded() {
     return zoneIdToTableFuture.isDone() && fixedTransitionsFuture.isDone() &&
-            zoneIdVectorFuture.isDone() && specialTzLiteralsFuture.isDone();
+            zoneIdVectorFuture.isDone();
   }
 
   private void loadData(Executor executor) throws IllegalStateException {
@@ -286,18 +282,14 @@ private void doLoadData() {
 
         try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) {
           try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) {
-            try (HostColumnVector specialTzVector = HostColumnVector.fromStrings(SPECIAL_TZ_LITERALS)) {
-              fixedTransitionsFuture.complete(fixedTransitions.incRefCount());
-              zoneIdVectorFuture.complete(zoneIdVector.incRefCount());
-              specialTzLiteralsFuture.complete(specialTzVector.incRefCount());
-            }
+            fixedTransitionsFuture.complete(fixedTransitions.incRefCount());
+            zoneIdVectorFuture.complete(zoneIdVector.incRefCount());
           }
         }
       } catch (Exception e) {
         fixedTransitionsFuture.completeExceptionally(e);
         zoneIdToTableFuture.completeExceptionally(e);
         zoneIdVectorFuture.completeExceptionally(e);
-        specialTzLiteralsFuture.completeExceptionally(e);
         throw e;
       }
     }
@@ -340,15 +332,6 @@ public ColumnVector getZoneIDVector() {
     }
   }
 
-  public ColumnVector getSpecialTzVector() {
-    try {
-      HostColumnVector hcv = specialTzLiteralsFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS);
-      return hcv.copyToDevice();
-    } catch (InterruptedException | ExecutionException | TimeoutException e) {
-      throw new RuntimeException(e);
-    }
-  }
-
   public Table getTransitions() {
     try (ColumnVector fixedTransitions = getFixedTransitions()) {
       return new Table(fixedTransitions);
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index e5384e4be7..03ab672c4c 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -436,11 +436,15 @@ void toTimestampTestWithTz() {
         CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
       }
     });
+
     // Throw unsupported exception for symbols of special dates
+    // Note: Spark 31x supports "epoch", "now", "today", "yesterday", "tomorrow".
+    // But Spark 32x to Spark 35x do not support them.
+    // Currently the JNI does not support them either.
     for (String date : new String[]{"epoch", "now", "today", "yesterday", "tomorrow"})
-    assertThrows(ai.rapids.cudf.CudfException.class, () -> {
+    assertThrows(IllegalArgumentException.class, () -> {
       try (ColumnVector input = ColumnVector.fromStrings(date)) {
-        CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), true);
       }
     });
 

From 167df5057a32c71e4083e2709438a9f582c550be Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 12 Jan 2024 18:40:08 +0800
Subject: [PATCH 15/35] Add comments

---
 .../nvidia/spark/rapids/jni/CastStrings.java  | 42 +++++++++++++++----
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index c515269c27..d27fa5c118 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -167,10 +167,23 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
    * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
    * 
-   * Supports the following time zones:
-   * - Z - Zulu time zone UTC+0
-   * - +|-[h]h:[m]m
-   * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   * Spark supports the following zone id forms:
+   *   - Z - Zulu time zone UTC+0
+   *   - +|-[h]h:[m]m
+   *   - A short id, see
+   * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+   *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
+   *     and a suffix in the formats:
+   *     - +|-h[h]
+   *     - +|-hh[:]mm
+   *     - +|-hh:mm:ss
+   *     - +|-hhmmss
+   *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   *
+   * Unlike Spark, Spark-Rapids currently does not support DST time zones.
+   *
+   * Note: Casting the special strings (epoch, now, today, yesterday, tomorrow) to timestamp
+   * is not supported. Spark31x supports casting them while Spark320+ does not.
    *
    * Example:
    * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]
@@ -219,10 +232,23 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
    * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
    * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
    * 
-   * Supports the following time zones:
-   * - Z - Zulu time zone UTC+0
-   * - +|-[h]h:[m]m
-   * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   * Spark supports the following zone id forms:
+   *   - Z - Zulu time zone UTC+0
+   *   - +|-[h]h:[m]m
+   *   - A short id, see
+   * https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+   *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
+   *     and a suffix in the formats:
+   *     - +|-h[h]
+   *     - +|-hh[:]mm
+   *     - +|-hh:mm:ss
+   *     - +|-hhmmss
+   *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
+   *
+   * Unlike Spark, Spark-Rapids currently does not support DST time zones.
+   *
+   * Note: Casting the special strings (epoch, now, today, yesterday, tomorrow) to timestamp
+   * is not supported. Spark31x supports casting them while Spark320+ does not.
    *
    * Example:
    * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]

From ec1c68687c1cc4267cd7d9c118a1126f343587c9 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 10:08:28 +0800
Subject: [PATCH 16/35] Add comments; Add test cases

---
 .../com/nvidia/spark/rapids/jni/CastStrings.java |  9 +++++++--
 .../nvidia/spark/rapids/jni/CastStringsTest.java | 16 ++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index d27fa5c118..f45d2163d2 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -190,14 +190,19 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    * ts = toTimestamp(input, "UTC", allowSpecialExpressions = true, ansiEnabled =
    * false)
    * ts is: ['2023-01-01 00:00:00', '2023-01-01T00:00:00']
+   *
+   * Example:
+   * input = ["2023-01-01T08:00:00 non-exist-time-zone"]
+   * In ANSI mode: throws IllegalArgumentException
+   * In non-ANSI mode: return null value
    * 
    * @param cv                      The input string column to be converted.
    * @param defaultTimeZone         Use the default time zone if string does not
    *                                contain time zone.
    * @param ansiEnabled             is Ansi mode
    * @return a timestamp column
-   * @throws IllegalArgumentException if cv contains invalid value when
-   *                                  ansiEnabled is true
+   * @throws IllegalArgumentException if cv contains an invalid value or a non-existent
+   *                                  time zone when ansiEnabled is true
    */
   public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) {
     if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) {
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 03ab672c4c..919f0d035e 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -472,5 +472,21 @@ void toTimestampTestWithTz() {
         CastStrings.toTimestamp(input, ZoneId.of("UTC"), true);
       }
     });
+
+    // Throw IllegalArgumentException for non-exist-tz in ANSI mode
+    assertThrows(IllegalArgumentException.class, () -> {
+      try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz")) {
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), true);
+      }
+    });
+
+    // Return null for non-exist-tz in non-Ansi mode
+    try (
+      ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz");
+      ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) {
+        Long[] expected = {null};
+        AssertUtils.assertColumnsAreEqual(expected, actual);
+    }
+
   }
 }

From 3cba7d06d27eb3b888b27c5a0e4bb22fb3a9cf26 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 10:39:12 +0800
Subject: [PATCH 17/35] Address comments

---
 src/main/cpp/src/datetime_parser.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 3ba84f2d26..ec040e89f2 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -402,6 +402,8 @@ struct parse_timestamp_string_fn {
     const char* const bytes      = curr_ptr;
     const size_type bytes_length = end_ptr - curr_ptr;
 
+    // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, no_use_item]
+    // the two trailing items are unused; they are kept only to mirror Spark's implementation
     int segments[]             = {1, 1, 1, 0, 0, 0, 0, 0, 0};
     int segments_len           = 9;
     int i                      = 0;
@@ -521,7 +523,9 @@ struct parse_timestamp_string_fn {
     segments[0] *= year_sign.value_or(1);
     // above is ported from Spark.
 
-    // set components
+    // copy segments to the equivalent kernel timestamp_components
+    // Note: To keep the code above equivalent to the Spark implementation,
+    //       `timestamp_components` is not used directly to store the parsed values.
     ts_comp->year         = segments[0];
     ts_comp->month        = static_cast<int8_t>(segments[1]);
     ts_comp->day          = static_cast<int8_t>(segments[2]);

From e6af1958ea768013adc41e5a227c0113c7a8fb21 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 10:52:20 +0800
Subject: [PATCH 18/35] Fix case

---
 src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 919f0d035e..372ae3bd42 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -481,10 +481,11 @@ void toTimestampTestWithTz() {
     });
 
     // Return null for non-exist-tz in non-Ansi mode
+    Long[] nullExpected = {null};
     try (
       ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 non-exist-tz");
+      ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(nullExpected);
       ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) {
-        Long[] expected = {null};
         AssertUtils.assertColumnsAreEqual(expected, actual);
     }
 

From 0b33ff9f3b6b5a22475de3f7c1af3cfdea3277e7 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 10:54:14 +0800
Subject: [PATCH 19/35] Update

---
 src/main/cpp/src/datetime_parser.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index ec040e89f2..86190ba861 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -383,8 +383,6 @@ struct parse_timestamp_string_fn {
     size_type* parsed_tz_length,
     cudf::string_view const& timestamp_str) const
   {
-    if (timestamp_str.empty()) { return ParseResult::INVALID; }
-
     const char* curr_ptr = timestamp_str.data();
     const char* end_ptr  = curr_ptr + timestamp_str.size_bytes();
 

From a8fc54cf83589b5e17a82dbbe1c54e78ac6d79af Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 19:08:14 +0800
Subject: [PATCH 20/35] Support short time zone IDs, like PST, CTT......

---
 src/main/cpp/src/CastStringJni.cpp            |  7 ++--
 src/main/cpp/src/datetime_parser.cu           | 34 ++++++++++++-------
 src/main/cpp/src/datetime_parser.hpp          |  3 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |  7 ++--
 .../spark/rapids/jni/GpuTimeZoneDB.java       | 34 +++++++++++++++++++
 .../spark/rapids/jni/CastStringsTest.java     |  3 ++
 6 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 82a3dc3242..ab4f977750 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -264,7 +264,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
                                                          jlong transitions_handle,
                                                          jlong tz_indices_col,
                                                          jint tz_default_index,
-                                                         jboolean ansi_enabled)
+                                                         jboolean ansi_enabled,
+                                                         jlong tz_short_ids)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
@@ -277,9 +278,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
     auto const& tz_indices_view =
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(tz_indices_col));
     auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
-
+    const cudf::column_view *tz_short_ids_view = reinterpret_cast<cudf::column_view *>(tz_short_ids);
     auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz(
-      input_view, transitions, tz_indices_view, tz_index, ansi_enabled);
+      input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 86190ba861..656215364d 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -123,6 +123,7 @@ struct parse_timestamp_string_fn {
   // looseTzInstant: int64>>.
   thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
   thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
+  thrust::optional<column_device_view const> tz_short_ids      = thrust::nullopt;
 
   __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
   {
@@ -161,6 +162,19 @@ struct parse_timestamp_string_fn {
       tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
+
+      // Map short TZ ID to region-based timezone if tz_view is a short ID
+      auto const& short_name_col = tz_short_ids->child(0);
+      auto const& region_based_col = tz_short_ids->child(1);
+      for (size_type i = 0; i < tz_short_ids->size(); i++) {
+        auto const& curr_short_id = short_name_col.element<string_view>(i);
+        if (curr_short_id == tz_view) {
+          // find short ID, replace tz_view with mapped region TZ ID
+          tz_view = region_based_col.element<string_view>(i);
+          break;
+        }
+      }
+
       // Firstly, try parsing as utc-like timezone rep
       if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) {
         tz_offset = utc_offset;
@@ -192,7 +206,6 @@ struct parse_timestamp_string_fn {
   }
 
   /**
-   * TODO: support CST/PST/AST
    *
    * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01.
    * This function is purposed to be fully align to Apache Spark's behavior. The
@@ -207,8 +220,6 @@ struct parse_timestamp_string_fn {
    *   without colon
    *     hh only    : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8])
    *     hh:mm:(ss) : ^(GMT|UTC)?[+-](0[0-9]|1[0-8])([0-5][0-9])?([0-5][0-9])?
-   *   special symbols:
-   *                  ^(Z|CST|PST|AST|...)
    *
    *   additional restriction: 18:00:00 is the upper bound (which means 18:00:01
    * is invalid)
@@ -220,12 +231,6 @@ struct parse_timestamp_string_fn {
 
     char const* ptr = tz_lit.data();
 
-    // try to parse Z
-    if (*ptr == 'Z') {
-      if (len > 1) return {0, 1};
-      return {0, 0};
-    }
-
     size_t char_offset = 0;
     // skip UTC|GMT if existing
     if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') ||
@@ -549,7 +554,8 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
                                            bool allow_tz_in_date_str                   = true,
                                            size_type default_tz_index                  = 1000000000,
                                            cudf::column_view const* transitions        = nullptr,
-                                           cudf::strings_column_view const* tz_indices = nullptr)
+                                           cudf::strings_column_view const* tz_indices = nullptr,
+                                           cudf::column_view const* tz_short_ids       = nullptr)
 {
   auto const stream = cudf::get_default_stream();
   auto const mr     = rmm::mr::get_current_device_resource();
@@ -580,6 +586,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
     auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
     auto d_tz_indices        = cudf::column_device_view::create(tz_indices->parent(), stream);
+    auto d_tz_short_ids      = column_device_view::create(*tz_short_ids, stream);
 
     thrust::transform(
       rmm::exec_policy(stream),
@@ -589,7 +596,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
         thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
                            result_valid_col->mutable_view().begin<uint8_t>())),
       parse_timestamp_string_fn<true>{
-        *d_strings, default_tz_index, true, d_transitions, *d_tz_indices});
+        *d_strings, default_tz_index, true, d_transitions, *d_tz_indices, *d_tz_short_ids});
   }
 
   auto valid_view = result_valid_col->mutable_view();
@@ -633,11 +640,12 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::column_view const& transitions,
   cudf::strings_column_view const& tz_indices,
   cudf::size_type default_tz_index,
-  bool ansi_mode)
+  bool ansi_mode,
+  cudf::column_view const& tz_short_ids)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(
-    input, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
+    input, ansi_mode, true, default_tz_index, &transitions, &tz_indices, &tz_short_ids);
 }
 
 /**
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index c7f9d5ec65..14bfa6a060 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -77,7 +77,8 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::column_view const& transitions,
   cudf::strings_column_view const& tz_indices,
   cudf::size_type default_tz_index,
-  bool ansi_mode);
+  bool ansi_mode,
+  cudf::column_view const& tz_short_ids);
 
 /**
  *
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index f45d2163d2..455e71333c 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -218,9 +218,10 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
     Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString());
 
     try (Table transitions = singleton.getTransitions();
-         ColumnVector tzIndices = singleton.getZoneIDVector()) {
+         ColumnVector tzIndices = singleton.getZoneIDVector();
+         ColumnVector tzShortIDs = singleton.getTimeZoneShortIDs()) {
       return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(),
-              tzIndices.getNativeView(), tzIndex, ansiEnabled));
+              tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView()));
     }
   }
 
@@ -291,7 +292,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);
   private static native long toTimestamp(long input,
-      long transitions, long tzIndices, int tzIndex, boolean ansiEnabled);
+      long transitions, long tzIndices, int tzIndex, boolean ansiEnabled, long tzShortIDs);
   private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone,
       boolean ansiEnabled);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 6b09d2dda0..990d601889 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -42,6 +42,9 @@ public class GpuTimeZoneDB {
   private CompletableFuture<Map<String, Integer>> zoneIdToTableFuture;
   private CompletableFuture<HostColumnVector> fixedTransitionsFuture;
   private CompletableFuture<HostColumnVector> zoneIdVectorFuture;
+  // Stores the Java ZoneId.SHORT_IDS map, e.g.: PST->America/Los_Angeles
+  // Note: an extra entry Z->UTC is also added
+  private HostColumnVector shortIDs;
 
   private boolean closed = false;
 
@@ -171,10 +174,41 @@ private void loadData(Executor executor) throws IllegalStateException {
     }
   }
 
+  /**
+   * load ZoneId.SHORT_IDS and append Z->UTC.
+   * The first 3 entries are: Z->UTC, PST->America/Los_Angeles, CTT->Asia/Shanghai
+   */
+  private void loadTimeZoneShortIDs() {
+    HostColumnVector.DataType type = new HostColumnVector.StructType(false,
+    new HostColumnVector.BasicType(false, DType.STRING),
+    new HostColumnVector.BasicType(false, DType.STRING));
+    ArrayList<HostColumnVector.StructData> data = new ArrayList<>();
+    // add Z->UTC
+    data.add(new HostColumnVector.StructData("Z", "UTC"));
+    // add PST CTT
+    for (Map.Entry<String, String> e : ZoneId.SHORT_IDS.entrySet()) {
+      if (e.getKey().equals("PST") || e.getKey().equals("CTT")) {
+        data.add(new HostColumnVector.StructData(e.getKey(), e.getValue()));
+      }
+    }
+    // add others
+    for (Map.Entry<String, String> e : ZoneId.SHORT_IDS.entrySet()) {
+      if (!(e.getKey().equals("PST") || e.getKey().equals("CTT"))) {
+        data.add(new HostColumnVector.StructData(e.getKey(), e.getValue()));
+      }
+    }
+    shortIDs = HostColumnVector.fromStructs(type, data);
+  }
+
+  public ColumnVector getTimeZoneShortIDs() {
+    return shortIDs.copyToDevice();
+  }
+
   @SuppressWarnings("unchecked")
   private void doLoadData() {
     synchronized (this) {
       try {
+        loadTimeZoneShortIDs();
         Map<String, Integer> zoneIdToTable = new HashMap<>();
         List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
         // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings.
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 372ae3bd42..6bee3c771c 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -404,6 +404,9 @@ void toTimestampTestWithTz() {
     entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 00:1:20.3   -020103", 1571536943300000L));
     entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1   -8:08:01  ", 1571640105100000L));
     entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1+4:59:59", 1571592825100000L));
+    // short TZ ID: BST->Asia/Dhaka, CTT->Asia/Shanghai
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 CTT", 1699124695100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 BST", 1699124695100000L + 7200L * 1000000L)); // BST is 2 hours later than CTT
 
     int validDataSize = entries.size();
 

From 10eba22dec1ba22e5a3e6db57e0e563ebe1c1d50 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 15 Jan 2024 19:14:23 +0800
Subject: [PATCH 21/35] Format code

---
 src/main/cpp/src/CastStringJni.cpp   | 18 +++++++-----------
 src/main/cpp/src/datetime_parser.cu  | 16 +++++++---------
 src/main/cpp/src/datetime_parser.hpp |  7 +++----
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index ab4f977750..fa48650f32 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -277,9 +277,9 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
       reinterpret_cast<cudf::table_view const*>(transitions_handle)->column(0);
     auto const& tz_indices_view =
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(tz_indices_col));
-    auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
-    const cudf::column_view *tz_short_ids_view = reinterpret_cast<cudf::column_view *>(tz_short_ids);
-    auto ret_cv = spark_rapids_jni::string_to_timestamp_with_tz(
+    auto const tz_index                        = static_cast<cudf::size_type>(tz_default_index);
+    const cudf::column_view* tz_short_ids_view = reinterpret_cast<cudf::column_view*>(tz_short_ids);
+    auto ret_cv                                = spark_rapids_jni::string_to_timestamp_with_tz(
       input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
@@ -291,12 +291,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
   JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Parse failed on Ansi mode", 0);
 }
 
-JNIEXPORT jlong JNICALL
-Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv* env,
-                                                                        jclass,
-                                                                        jlong input_column,
-                                                                        jboolean allow_time_zone,
-                                                                        jboolean ansi_enabled)
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(
+  JNIEnv* env, jclass, jlong input_column, jboolean allow_time_zone, jboolean ansi_enabled)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
@@ -304,8 +300,8 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestampWithoutTimeZone(JNIEnv*
     auto const& input_view =
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(input_column));
 
-    auto ret_cv = spark_rapids_jni::string_to_timestamp_without_tz(
-      input_view, allow_time_zone, ansi_enabled);
+    auto ret_cv =
+      spark_rapids_jni::string_to_timestamp_without_tz(input_view, allow_time_zone, ansi_enabled);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 656215364d..ffaa8e6b6f 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -164,7 +164,7 @@ struct parse_timestamp_string_fn {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
 
       // Map short TZ ID to region-based timezone if tz_view is a short ID
-      auto const& short_name_col = tz_short_ids->child(0);
+      auto const& short_name_col   = tz_short_ids->child(0);
       auto const& region_based_col = tz_short_ids->child(1);
       for (size_type i = 0; i < tz_short_ids->size(); i++) {
         auto const& curr_short_id = short_name_col.element<string_view>(i);
@@ -405,8 +405,8 @@ struct parse_timestamp_string_fn {
     const char* const bytes      = curr_ptr;
     const size_type bytes_length = end_ptr - curr_ptr;
 
-    // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item, no_use_item]
-    // the two tail items are no use, but here keeps them as Spark does
+    // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item,
+    // no_use_item]; the two tail items are unused but are kept here as Spark does
     int segments[]             = {1, 1, 1, 0, 0, 0, 0, 0, 0};
     int segments_len           = 9;
     int i                      = 0;
@@ -580,8 +580,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
       thrust::make_zip_iterator(
         thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
                            result_valid_col->mutable_view().begin<uint8_t>())),
-      parse_timestamp_string_fn<false>{
-        *d_strings, default_tz_index, allow_tz_in_date_str});
+      parse_timestamp_string_fn<false>{*d_strings, default_tz_index, allow_tz_in_date_str});
   } else {
     auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
@@ -655,10 +654,9 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  * If allow_time_zone is false and string contains time zone, then the string is
  * invalid.
  */
-std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
-  cudf::strings_column_view const& input,
-  bool allow_time_zone,
-  bool ansi_mode)
+std::unique_ptr<cudf::column> string_to_timestamp_without_tz(cudf::strings_column_view const& input,
+                                                             bool allow_time_zone,
+                                                             bool ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(input, ansi_mode, allow_time_zone);
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 14bfa6a060..f750594f9f 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -131,9 +131,8 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
  * @returns the pointer of the timestamp result column, which points to nullptr
  * if there exists invalid inputs and ANSI mode is on.
  */
-std::unique_ptr<cudf::column> string_to_timestamp_without_tz(
-  cudf::strings_column_view const& input,
-  bool allow_time_zone,
-  bool ansi_mode);
+std::unique_ptr<cudf::column> string_to_timestamp_without_tz(cudf::strings_column_view const& input,
+                                                             bool allow_time_zone,
+                                                             bool ansi_mode);
 
 }  // namespace spark_rapids_jni

From 24e81cd54d2f6b9acdfcac0729d9b19aa7a38a40 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Tue, 16 Jan 2024 09:48:34 +0800
Subject: [PATCH 22/35] Update comments

---
 .../com/nvidia/spark/rapids/jni/CastStrings.java  | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 455e71333c..a87a754f35 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -182,8 +182,11 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    *
    * Unlike Spark, Spark-Rapids currently does not support DST time zones.
    *
-   * Note: Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp.
+   * Note:
+   * - Casting special strings (epoch, now, today, yesterday, tomorrow) to timestamp is not supported.
    * Spark31x supports cast special strings while Spark320+ do not supports
+   * - DST time zones are not supported: ai.rapids.cudf.CudfException is thrown
+   *   if the input contains a DST time zone.
    *
    * Example:
    * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]
@@ -201,8 +204,9 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    *                                contain time zone.
    * @param ansiEnabled             is Ansi mode
    * @return a timestamp column
-   * @throws IllegalArgumentException if cv contains invalid value or the time zone is
-   *                                  non-existed when ansiEnabled is true
+   * @throws IllegalArgumentException if any string in cv has an invalid format or its time zone
+   *                                  does not exist or is wrong, when ansiEnabled is true
+   * @throws CudfException            if time zone is a DST time zone
    */
   public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) {
     if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) {
@@ -269,8 +273,9 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
    *                                when do not allow time zone.
    * @param ansiEnabled             is Ansi mode
    * @return a timestamp column
-   * @throws IllegalArgumentException if cv contains invalid value when
-   *                                  ansiEnabled is true
+   * @throws IllegalArgumentException if ansiEnabled is true and any string in cv has an invalid
+   *                                  format, or contains a time zone while `allowTimeZone` is false.
+   *
    */
   public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) {
     GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance();

From 374dede799d244e5a537f4158c2d8b2429c6b6e0 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 18 Jan 2024 17:22:52 +0800
Subject: [PATCH 23/35] Update comments

---
 src/main/cpp/src/datetime_parser.cu           | 29 ++++++++++---------
 .../spark/rapids/jni/CastStringsTest.java     |  2 +-
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index ffaa8e6b6f..aa21459c0e 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -542,11 +542,13 @@ struct parse_timestamp_string_fn {
 };
 
 /**
- * The common entrance of string_to_timestamp, which combines two paths:
- * with_timezone and without_timezone. This function returns the The
- * transitions, tz_indices and default_tz_index are only for handling inputs
- * with timezone. So, this function distinguish with_timezone calls from
- * without_timezone ones by checking if transitions and tz_indices are nullptr.
+ * The common entrance of string_to_timestamp, two paths call this function:
+ * - `string_to_timestamp_with_tz` : with time zone
+ * - `string_to_timestamp_without_tz` : without time zone
+ * The parameters transitions, tz_indices and default_tz_index are only for handling
+ * inputs with timezone.
+ * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices
+ * are nullptr, otherwise called from `string_to_timestamp_with_tz`.
  *
  */
 std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
@@ -630,9 +632,11 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
 namespace spark_rapids_jni {
 
 /**
- * Parse string column with time zone to timestamp column,
- * Returns a pair of timestamp column and a bool indicates whether it successes.
- * If does not have time zone in string, use the default time zone.
+ * Parse string column with time zone to timestamp column.
+ * If a string does not have time zone in it, use the default time zone.
+ * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null
+ * timestamp column otherwise.
+ *
  */
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::strings_column_view const& input,
@@ -648,11 +652,10 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
 }
 
 /**
- * Parse string column with time zone to timestamp column,
- * Returns a pair of timestamp column and a bool indicates whether it successes.
- * Do not use the time zone in string.
- * If allow_time_zone is false and string contains time zone, then the string is
- * invalid.
+ * Parse string column without time zone to timestamp column.
+ * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null
+ * timestamp column otherwise.
+ *
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(cudf::strings_column_view const& input,
                                                              bool allow_time_zone,
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 6bee3c771c..ca3049bfed 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -440,7 +440,7 @@ void toTimestampTestWithTz() {
       }
     });
 
-    // Throw unsupported exception for symbols of special dates
+    // Throw IllegalArgumentException for symbols of special dates
     // Note: Spark 31x supports "epoch", "now", "today", "yesterday", "tomorrow".
     // But Spark 32x to Spark 35x do not supports.
     // Currently JNI do not supports

From 1d0ef4c6b477e4ae5a39e305a37ce53e93141fc9 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Mon, 22 Jan 2024 23:20:38 +0800
Subject: [PATCH 24/35] Address comments

---
 src/main/cpp/src/datetime_parser.cu           | 146 +++++++++---------
 .../spark/rapids/jni/GpuTimeZoneDB.java       |  64 +++-----
 2 files changed, 94 insertions(+), 116 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index aa21459c0e..996b3a253a 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -119,8 +119,7 @@ struct parse_timestamp_string_fn {
   bool allow_tz_in_date_str = true;
   // The list column of transitions to figure out the correct offset
   // to adjust the timestamp. The type of the values in this column is
-  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
-  // looseTzInstant: int64>>.
+  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32>>.
   thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
   thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
   thrust::optional<column_device_view const> tz_short_ids      = thrust::nullopt;
@@ -157,43 +156,44 @@ struct parse_timestamp_string_fn {
 
     // path with timezone, in which timezone offset has to be determined before
     // computing unix_timestamp
-    int64_t tz_offset;
+    int64_t utc_offset;
     if (tz_lit_ptr == nullptr) {
-      tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
+      // no trailing tz in the string, use the default tz
+      utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
 
-      // Map short TZ ID to region-based timezone if tz_view is a short ID
-      auto const& short_name_col   = tz_short_ids->child(0);
-      auto const& region_based_col = tz_short_ids->child(1);
-      for (size_type i = 0; i < tz_short_ids->size(); i++) {
-        auto const& curr_short_id = short_name_col.element<string_view>(i);
-        if (curr_short_id == tz_view) {
-          // find short ID, replace tz_view with mapped region TZ ID
-          tz_view = region_based_col.element<string_view>(i);
-          break;
-        }
+      // map tz short IDs (binary search: child(0) must be sorted ascending), three map types:
+      //   1:  Z->UTC
+      //   2:  short ID->regional based tz
+      //   3:  MST->"-07:00"
+      auto const& short_tz_id_col = tz_short_ids->child(0);
+      auto const& map_to_tz_col   = tz_short_ids->child(1);
+      auto const it = thrust::lower_bound(thrust::seq,  // lower_bound: `it` lands ON an equal key
+        short_tz_id_col.begin<string_view>(), short_tz_id_col.end<string_view>(), tz_view);
+      if (it != short_tz_id_col.end<string_view>() && *it == tz_view) {
+        auto short_tz_id_idx = static_cast<size_type>(it - short_tz_id_col.begin<string_view>());
+        // found a map, replace with mapped tz
+        tz_view = map_to_tz_col.element<string_view>(short_tz_id_idx);
       }
 
       // Firstly, try parsing as utc-like timezone rep
-      if (auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view); ret_code == 0) {
-        tz_offset = utc_offset;
-      } else if (ret_code == 1) {
+      auto [parsed_offset, ret_code] = parse_utc_like_tz(tz_view);  // must not shadow utc_offset
+      if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
+        utc_offset = parsed_offset;
+      } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
         // Then, try parsing as region-based timezone ID
         auto tz_index = query_index_from_tz_db(tz_view);
-        // tz_index < size(tzDB): found the ID in tzDB
-        // size(tzDB) <= tz_index < size(tzIDs): found the ID but not supported
-        // yet tz_index == size(tzIDs): invalid timezone ID
-        if (tz_index > transitions->size()) {
-          if (tz_index == tz_indices->size())
-            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                      ParseResult::INVALID);
+        if (tz_index < 0) {
+          // invalid tz
           return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                    ParseResult::UNSUPPORTED);
+                                    ParseResult::INVALID);
+        } else {
+          // supported tz: look up the offset in effect at this local time
+          utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), tz_index);
         }
-        tz_offset = extract_timezone_offset(compute_loose_epoch_s(ts_comp), tz_index);
       } else {
-        // (ret_code == 2) quick path to mark value invalid
+        // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid
         return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       }
     }
@@ -202,16 +202,20 @@ struct parse_timestamp_string_fn {
     auto const ts_unaligned = compute_epoch_us(ts_comp);
 
     return thrust::make_tuple(
-      cudf::timestamp_us{cudf::duration_us{ts_unaligned - tz_offset * 1000000L}}, ParseResult::OK);
+      cudf::timestamp_us{cudf::duration_us{ts_unaligned - utc_offset * 1000000L}}, ParseResult::OK);
   }
 
+  enum ParseUtcLikeTzResult {
+    UTC_LIKE_TZ = 0, // the UTC-like timezone offset was parsed successfully
+    NOT_UTC_LIKE_TZ = 1,    // not a UTC-like representation; may still be a valid region-based ID
+    INVALID = 2      // not a valid timezone representation at all
+  };
+
   /**
    *
    * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01.
    * This function is purposed to be fully align to Apache Spark's behavior. The
-   * function returns the status along with the result: 0 - successfully parsed
-   * the timezone offset 1 - not a valid UTC-like timezone representation, maybe
-   * valid region-based rep 2 - not a valid timezone representation
+   * function returns the status along with the ParseUtcLikeTzResult result.
    *
    * Valid patterns:
    *   with colon
@@ -224,7 +228,7 @@ struct parse_timestamp_string_fn {
    *   additional restriction: 18:00:00 is the upper bound (which means 18:00:01
    * is invalid)
    */
-  __device__ inline thrust::pair<int64_t, uint8_t> parse_utc_like_tz(
+  __device__ inline thrust::pair<int64_t, ParseUtcLikeTzResult> parse_utc_like_tz(
     string_view const& tz_lit) const
   {
     size_type len = tz_lit.size_bytes();
@@ -239,7 +243,7 @@ struct parse_timestamp_string_fn {
     }
 
     // return for the pattern UTC|GMT (without exact offset)
-    if (len == char_offset) return {0, 0};
+    if (len == char_offset) return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ};
 
     // parse sign +|-
     char const sign_char = *(ptr + char_offset++);
@@ -250,7 +254,7 @@ struct parse_timestamp_string_fn {
       sign = -1L;
     } else {
       // if the rep starts with UTC|GMT, it can NOT be region-based rep
-      return {0, char_offset < 3 ? 1 : 2};
+      return {0, char_offset < 3 ? ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID};
     }
 
     // parse hh:mm:ss
@@ -259,14 +263,14 @@ struct parse_timestamp_string_fn {
     for (size_type i = 0; i < 3; i++) {
       // deal with the first digit
       hms[i] = *(ptr + char_offset++) - '0';
-      if (hms[i] < 0 || hms[i] > 9) return {0, 2};
+      if (hms[i] < 0 || hms[i] > 9) return {0, ParseUtcLikeTzResult::INVALID};
 
       // deal with trailing single digit instant:
       //  hh(GMT+8) - valid
       //  mm(GMT+11:2) - must be separated from (h)h by `:`
       //  ss(GMT-11:22:3) - invalid
       if (len == char_offset) {
-        if (i == 2 || (i == 1 && !has_colon)) return {0, 2};
+        if (i == 2 || (i == 1 && !has_colon)) return {0, ParseUtcLikeTzResult::INVALID};
         break;
       }
 
@@ -275,68 +279,68 @@ struct parse_timestamp_string_fn {
         // 1. (i == 1) one_digit mm with ss is invalid (+11:2:3)
         // 2. (i == 2) one_digit ss is invalid (+11:22:3)
         // 3. trailing `:` is invalid (GMT+8:)
-        if (i > 0 || len == ++char_offset) return {0, 2};
+        if (i > 0 || len == ++char_offset) return {0, ParseUtcLikeTzResult::INVALID};
         has_colon = true;
         continue;
       }
 
       // deal with the second digit
       auto digit = *(ptr + char_offset++) - '0';
-      if (digit < 0 || digit > 9) return {0, 2};
+      if (digit < 0 || digit > 9) return {0, ParseUtcLikeTzResult::INVALID};
       hms[i] = hms[i] * 10 + digit;
 
       if (len == char_offset) break;
       // deal with `:`
       if (*(ptr + char_offset) == ':') {
         // trailing `:` is invalid (UTC+11:)
-        if (len == ++char_offset) return {0, 2};
+        if (len == ++char_offset) return {0, ParseUtcLikeTzResult::INVALID};
         has_colon = true;
       }
     }
 
     // the upper bound is 18:00:00 (regardless of sign)
-    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, 2};
-    if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, 2};
+    if (hms[0] > 18 || hms[1] > 59 || hms[2] > 59) return {0, ParseUtcLikeTzResult::INVALID};
+    if (hms[0] == 18 && hms[1] + hms[2] > 0) return {0, ParseUtcLikeTzResult::INVALID};
 
-    return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), 0};
+    return {sign * (hms[0] * 3600L + hms[1] * 60L + hms[2]), ParseUtcLikeTzResult::UTC_LIKE_TZ};
   }
 
   /**
-   * TODO: replace linear search with more efficient approach (like prefix tree)
+   * Binary-search tz_indices (must be sorted ascending) for tz_lit; returns its index or -1.
    */
   __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const
   {
-    auto predicate = [tz = tz_indices, &tz_lit] __device__(auto const i) {
-      return tz->element<string_view>(i) == tz_lit;
-    };
-    auto ret = thrust::find_if(thrust::seq,
-                               thrust::make_counting_iterator(0),
-                               thrust::make_counting_iterator(tz_indices->size()),
-                               predicate);
-
-    return *ret;
+    // lower_bound (not upper_bound): on an exact match `it` points AT the matching element
+    auto const first = tz_indices->begin<string_view>();
+    auto const last  = tz_indices->end<string_view>();
+    auto const it    = thrust::lower_bound(thrust::seq, first, last, tz_lit);
+    if (it == last || *it != tz_lit) {
+      // tz_lit is not present in the timezone ID column
+      return -1;
+    }
+    return static_cast<int>(it - first);
   }
 
   /**
-   * Perform binary search to search out the timezone offset based on loose epoch
+   * Perform binary search to search out the timezone offset based on local epoch
    * instants. Basically, this is the same approach as
    * `convert_timestamp_tz_functor`.
    */
-  __device__ inline int64_t extract_timezone_offset(int64_t loose_epoch_second,
+  __device__ inline int64_t extract_timezone_offset(int64_t local_epoch_second,
                                                     size_type tz_index) const
   {
+    auto const& tz_instants    = transitions->child().child(1);
     auto const& utc_offsets    = transitions->child().child(2);
-    auto const& loose_instants = transitions->child().child(3);
 
     auto const local_transitions = cudf::list_device_view{*transitions, tz_index};
     auto const list_size         = local_transitions.size();
 
     auto const transition_times = cudf::device_span<int64_t const>(
-      loose_instants.data<int64_t>() + local_transitions.element_offset(0),
+      tz_instants.data<int64_t>() + local_transitions.element_offset(0),
       static_cast<size_t>(list_size));
 
     auto const it = thrust::upper_bound(
-      thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
+      thrust::seq, transition_times.begin(), transition_times.end(), local_epoch_second);
     auto const idx         = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
     auto const list_offset = local_transitions.element_offset(idx - 1);
 
@@ -344,24 +348,9 @@ struct parse_timestamp_string_fn {
   }
 
   /**
-   * The formula to compute loose epoch from local time. The loose epoch is used
-   * to search for the corresponding timezone offset of specific zone ID from
-   * TimezoneDB. The target of loose epoch is to transfer local time to a number
-   * which is proportional to the real timestamp as easily as possible. Loose
-   * epoch, as a computation approach, helps us to align probe(kernel side) to
-   * the TimezoneDB(Java side). Then, we can apply binary search based on loose
-   * epoch instants of TimezoneDB to find out the correct timezone offset.
-   */
-  __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
-  {
-    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L +
-           ts.minute * 60L + ts.second;
-  }
-
-  /**
-   * Leverage STL to convert local time to UTC unix_timestamp(in millisecond)
+   * Leverage STL to convert local time to UTC unix_timestamp(in seconds)
    */
-  __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
+  __device__ inline int64_t compute_epoch_s(timestamp_components const& ts) const
   {
     auto const ymd =  // chrono class handles the leap year calculations for us
       cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year},
@@ -369,8 +358,15 @@ struct parse_timestamp_string_fn {
                                         cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
     auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
 
-    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+    return (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+  }
 
+  /**
+   * Convert timestamp components to epoch microseconds: compute_epoch_s(ts) * 1e6 + microseconds
+   */
+  __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
+  {
+    int64_t timestamp_s = compute_epoch_s(ts);
     return timestamp_s * 1000000L + ts.microseconds;
   }
 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 990d601889..087aba7549 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -17,18 +17,18 @@
 package com.nvidia.spark.rapids.jni;
 
 import java.time.Instant;
-import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.zone.ZoneOffsetTransition;
 import java.time.zone.ZoneRules;
 import java.time.zone.ZoneRulesException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.TimeZone;
 import java.util.concurrent.*;
-import java.util.function.Function;
 
 import ai.rapids.cudf.*;
 
@@ -175,26 +175,21 @@ private void loadData(Executor executor) throws IllegalStateException {
   }
 
   /**
-   * load ZoneId.SHORT_IDS and append Z->UTC.
-   * The first 3 entries are: Z->UTC, PST->America/Los_Angeles, CTT->Asia/Shanghai
+   * load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
    */
   private void loadTimeZoneShortIDs() {
     HostColumnVector.DataType type = new HostColumnVector.StructType(false,
     new HostColumnVector.BasicType(false, DType.STRING),
     new HostColumnVector.BasicType(false, DType.STRING));
     ArrayList<HostColumnVector.StructData> data = new ArrayList<>();
-    // add Z->UTC
-    data.add(new HostColumnVector.StructData("Z", "UTC"));
-    // add PST CTT
-    for (Map.Entry<String, String> e : ZoneId.SHORT_IDS.entrySet()) {
-      if (e.getKey().equals("PST") || e.getKey().equals("CTT")) {
-        data.add(new HostColumnVector.StructData(e.getKey(), e.getValue()));
-      }
-    }
-    // add others
-    for (Map.Entry<String, String> e : ZoneId.SHORT_IDS.entrySet()) {
-      if (!(e.getKey().equals("PST") || e.getKey().equals("CTT"))) {
-        data.add(new HostColumnVector.StructData(e.getKey(), e.getValue()));
+    List<String> idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet());
+    idList.add("Z");
+    Collections.sort(idList);
+    for (String id : idList) {
+      if (id.equals("Z")) {
+        data.add(new HostColumnVector.StructData(id, "UTC"));
+      } else {
+        data.add(new HostColumnVector.StructData(id, ZoneId.SHORT_IDS.get(id)));
       }
     }
     shortIDs = HostColumnVector.fromStructs(type, data);
@@ -208,23 +203,26 @@ public ColumnVector getTimeZoneShortIDs() {
   private void doLoadData() {
     synchronized (this) {
       try {
+        // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
         loadTimeZoneShortIDs();
+        
         Map<String, Integer> zoneIdToTable = new HashMap<>();
         List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
         // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings.
         // For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the
         // offset of "Asia/Shanghai" in timezoneDB.
         //
-        // Currently, we do NOT support all timezone IDs. For unsupported ones, we ought to throw Exception anyway. And
-        // for invalid ones, we replace them with NULL value when ANSI mode is off. Therefore, we need to distinguish the
-        // unsupported ones from invalid ones which means the unsupported Ids need to be collected as well.
-        // To distinguish supported IDs from unsupported ones, we place all unsupported IDs behind supported ones:
-        // 1. Collect the IDs of all supported timezones in the order of masterTransitions.
-        // 2. Append the IDs of all unsupported timezones after the suported ones.
+        // Currently, we do NOT support all timezone IDs. For unsupported time zones, like invalid ones,
+        // we replace them with NULL value when ANSI mode is off when parsing string to timestamp.
+        // This list only contains supported time zones.
         List<String> zondIdList = new ArrayList<>();
         List<String> unsupportedZoneList = new ArrayList<>();
+        
+        // sort the IDs
+        String[] availableIDs = TimeZone.getAvailableIDs();
+        Arrays.sort(availableIDs);
 
-        for (String tzId : TimeZone.getAvailableIDs()) {
+        for (String tzId : availableIDs) {
           ZoneId zoneId;
           try {
             zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe
@@ -257,17 +255,6 @@ private void doLoadData() {
                       first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE)
               );
               transitions.forEach(t -> {
-                // A simple approach to transform LocalDateTime to a value which is proportional to
-                // the exact EpochSecond. After caching these values along with EpochSeconds, we
-                // can easily search out which time zone transition rule we should apply according
-                // to LocalDateTime structs. The searching procedure is same as the binary search with
-                // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant"
-                // as search index instead of exact EpochSeconds.
-                Function<LocalDateTime, Long> localToLooseEpochSecond = lt ->
-                        86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L +
-                                lt.getDayOfMonth() - 1) +
-                                3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond();
-
                 // Whether transition is an overlap vs gap.
                 // In Spark:
                 // if it's a gap, then we use the offset after *on* the instant
@@ -279,8 +266,7 @@ private void doLoadData() {
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds(),
-                          localToLooseEpochSecond.apply(t.getDateTimeAfter())
+                          t.getOffsetAfter().getTotalSeconds()
                       )
                   );
                 } else {
@@ -288,8 +274,7 @@ private void doLoadData() {
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds(),
-                          localToLooseEpochSecond.apply(t.getDateTimeBefore())
+                          t.getOffsetAfter().getTotalSeconds()
                       )
                   );
                 }
@@ -311,9 +296,6 @@ private void doLoadData() {
         HostColumnVector.DataType resultType =
             new HostColumnVector.ListType(false, childType);
 
-        // Append the IDs of all unsupported timezones after the suported ones.
-        zondIdList.addAll(unsupportedZoneList);
-
         try (HostColumnVector fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]))) {
           try (HostColumnVector zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]))) {
             fixedTransitionsFuture.complete(fixedTransitions.incRefCount());

From b87584023c37678d9af7e1c6e86de042899b2e45 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Wed, 24 Jan 2024 00:18:16 +0800
Subject: [PATCH 25/35] Address comments

---
 src/main/cpp/src/datetime_parser.cu           | 204 +++++++++++-------
 .../nvidia/spark/rapids/jni/CastStrings.java  |   5 +
 .../spark/rapids/jni/GpuTimeZoneDB.java       |  71 ++++--
 .../spark/rapids/jni/CastStringsTest.java     |  10 +-
 4 files changed, 195 insertions(+), 95 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 996b3a253a..18750c9a27 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -103,6 +103,18 @@ __device__ __host__ bool is_valid_digits(int segment, int digits)
          (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
 }
 
+/**
+ * Functor that returns the string at a given row index of a strings column device view.
+ */
+struct get_string_fn {
+  column_device_view const& string_view;
+
+  __device__ cudf::string_view operator()(size_t idx)
+  {
+    return string_view.element<cudf::string_view>(idx);
+  }
+};
+
 /**
  * We have to distinguish INVALID value with UNSUPPORTED value.
  * INVALID means the value is invalid in Spark SQL.
@@ -119,9 +131,10 @@ struct parse_timestamp_string_fn {
   bool allow_tz_in_date_str = true;
   // The list column of transitions to figure out the correct offset
   // to adjust the timestamp. The type of the values in this column is
-  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32>>.
+  // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
+  // looseTzInstant: int64>>.
   thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
-  thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
+  thrust::optional<column_device_view const> sorted_tz_names   = thrust::nullopt;
   thrust::optional<column_device_view const> tz_short_ids      = thrust::nullopt;
 
   __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
@@ -159,42 +172,51 @@ struct parse_timestamp_string_fn {
     int64_t utc_offset;
     if (tz_lit_ptr == nullptr) {
       // no tz in the string tailing, use default tz
-      utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), default_tz_index);
+      utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
 
-      // map tz short IDs, has three map types: 
-      //   1:  Z->UTC; 
-      //   2:  short ID->regional based tz
-      //   3:  MST->"-07:00"
-      auto const& short_tz_id_col   = tz_short_ids->child(0);
-      auto const& map_to_tz_col = tz_short_ids->child(1);
-      auto const it = thrust::upper_bound(
-        thrust::seq, short_tz_id_col.begin(), short_tz_id_col.end(), tz_view);
-      if (it != short_tz_id_col.end() && *it == tz_view) {
-        auto short_tz_id_idx = static_cast<size_t>(it - short_tz_id_col.begin());
-        // found a map, replace with mapped tz
-        tz_view = map_to_tz_col.element<string_view>(short_tz_id_idx);
+      // map tz short IDs to time zone index in transitions.
+      // Here only handle regional base tz map: short ID->regional based tz
+      // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00
+      auto const short_tz_id_col = tz_short_ids->child(0);
+      auto const map_to_tz_col   = tz_short_ids->child(1);
+      auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                                               get_string_fn{short_tz_id_col});
+      auto string_iter_end   = string_iter_begin + short_tz_id_col.size();
+      auto it                = thrust::lower_bound(
+        thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less<string_view>());
+      int tz_index_for_short_tz = -1;
+      if (it != string_iter_end && *it == tz_view) {
+        // found a map, get the time zone index
+        auto short_id_index   = static_cast<size_type>(it - string_iter_begin);
+        tz_index_for_short_tz = static_cast<int>(map_to_tz_col.element<int32_t>(short_id_index));
       }
 
-      // Firstly, try parsing as utc-like timezone rep
-      auto [utc_offset, ret_code] = parse_utc_like_tz(tz_view);
-      if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
-        utc_offset = utc_offset;
-      } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
-        // Then, try parsing as region-based timezone ID
-        auto tz_index = query_index_from_tz_db(tz_view);
-        if (tz_index < 0) {
-          // invalid tz
-          return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                    ParseResult::INVALID);
+      if (tz_index_for_short_tz >= 0) {
+        // it's a supported short ID, and found the tz index
+        utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index_for_short_tz);
+      } else {
+        // Firstly, try parsing as utc-like timezone rep
+        // Note: parse_utc_like_tz handles special short IDs: EST: -05:00; HST: -10:00; MST: -07:00
+        auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view);
+        if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
+          utc_offset = fix_offset;
+        } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
+          // Then, try parsing as region-based timezone ID
+          auto tz_index = query_index_from_tz_db(tz_view);
+          if (tz_index < 0) {
+            // TODO: distinguish unsupported and invalid tz
+            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
+                                      ParseResult::INVALID);
+          } else {
+            // supported tz
+            utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index);
+          }
         } else {
-          // supported tz
-          utc_offset = extract_timezone_offset(compute_epoch_s(ts_comp), tz_index);
+          // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid
+          return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
         }
-      } else {
-        // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid
-        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       }
     }
 
@@ -205,10 +227,10 @@ struct parse_timestamp_string_fn {
       cudf::timestamp_us{cudf::duration_us{ts_unaligned - utc_offset * 1000000L}}, ParseResult::OK);
   }
 
-  enum ParseUtcLikeTzResult { 
-    UTC_LIKE_TZ = 0, // successfully parsed the timezone offset
-    NOT_UTC_LIKE_TZ = 1,    // not a valid UTC-like timezone representation, maybe valid region-based
-    INVALID = 2      // not a valid timezone representation
+  enum ParseUtcLikeTzResult {
+    UTC_LIKE_TZ     = 0,  // successfully parsed the timezone offset
+    NOT_UTC_LIKE_TZ = 1,  // not a valid UTC-like timezone representation, maybe valid region-based
+    INVALID         = 2   // not a valid timezone representation
   };
 
   /**
@@ -218,6 +240,11 @@ struct parse_timestamp_string_fn {
    * function returns the status along with the ParseUtcLikeTzResult result.
    *
    * Valid patterns:
+   *   Z: means UTC
+   *   short tz IDs that are UTC-like (fixed offsets)
+   *     EST: -05:00
+   *     HST: -10:00
+   *     MST: -07:00
    *   with colon
    *     hh:mm      : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9])
    *     hh:mm:ss   : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9]
@@ -235,6 +262,23 @@ struct parse_timestamp_string_fn {
 
     char const* ptr = tz_lit.data();
 
+    // Z time zone
+    if (len == 1 && *ptr == 'Z') { return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ}; }
+
+    // handle short tz IDs that are UTC-like: EST, HST, MST
+    if (len == 3) {
+      if ((*ptr == 'E' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
+        // EST: -05:00
+        return {-5L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
+      } else if ((*ptr == 'H' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
+        // HST: -10:00
+        return {-10L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
+      } else if ((*ptr == 'M' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
+        // MST: -07:00
+        return {-7L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
+      }
+    }
+
     size_t char_offset = 0;
     // skip UTC|GMT if existing
     if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') ||
@@ -254,7 +298,8 @@ struct parse_timestamp_string_fn {
       sign = -1L;
     } else {
       // if the rep starts with UTC|GMT, it can NOT be region-based rep
-      return {0, char_offset < 3 ? ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID};
+      return {
+        0, char_offset < 3 ? ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ : ParseUtcLikeTzResult::INVALID};
     }
 
     // parse hh:mm:ss
@@ -306,67 +351,75 @@ struct parse_timestamp_string_fn {
   }
 
   /**
-   * tz_indices is sorted, use binary search to find tz index.
+   * use binary search to find tz index.
    */
   __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const
   {
-    auto const it = thrust::upper_bound(thrust::seq,
-                               tz_indices->begin(),
-                               tz_indices->end(),
-                               tz_lit);
-    if (it != tz_indices->end() && *it == tz_lit) {
-      return it - tz_indices->begin();
+    auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                                             get_string_fn{*sorted_tz_names});
+    auto string_iter_end   = string_iter_begin + sorted_tz_names->size();
+    auto it                = thrust::lower_bound(
+      thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less<string_view>());
+    if (it != string_iter_end && *it == tz_lit) {
+      // found tz
+      return static_cast<int>(it - string_iter_begin);
     } else {
+      // not found tz
       return -1;
     }
   }
 
   /**
-   * Perform binary search to search out the timezone offset based on local epoch
+   * Perform binary search to search out the offset from UTC based on loose epoch
    * instants. Basically, this is the same approach as
    * `convert_timestamp_tz_functor`.
    */
-  __device__ inline int64_t extract_timezone_offset(int64_t local_epoch_second,
-                                                    size_type tz_index) const
+  __device__ inline int64_t compute_utc_offset(int64_t loose_epoch_second, size_type tz_index) const
   {
-    auto const& tz_instants    = transitions->child().child(1);
     auto const& utc_offsets    = transitions->child().child(2);
+    auto const& loose_instants = transitions->child().child(3);
 
     auto const local_transitions = cudf::list_device_view{*transitions, tz_index};
     auto const list_size         = local_transitions.size();
 
     auto const transition_times = cudf::device_span<int64_t const>(
-      tz_instants.data<int64_t>() + local_transitions.element_offset(0),
+      loose_instants.data<int64_t>() + local_transitions.element_offset(0),
       static_cast<size_t>(list_size));
 
     auto const it = thrust::upper_bound(
-      thrust::seq, transition_times.begin(), transition_times.end(), local_epoch_second);
+      thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
     auto const idx         = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
     auto const list_offset = local_transitions.element_offset(idx - 1);
-
     return static_cast<int64_t>(utc_offsets.element<int32_t>(list_offset));
   }
 
   /**
-   * Leverage STL to convert local time to UTC unix_timestamp(in seconds)
+   * Compute the "loose epoch" second of a local date-time. The loose epoch is
+   * only a search key: it maps a local date-time to a number that preserves
+   * chronological order (months are padded to 31 days and years to 400 days)
+   * without any calendar arithmetic. The Java side (GpuTimeZoneDB) stores the
+   * same value for every transition, so the kernel can binary search the
+   * loose instants of a time zone with the loose epoch of the parsed local
+   * time to find the UTC offset that applies.
    */
-  __device__ inline int64_t compute_epoch_s(timestamp_components const& ts) const
+  __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
   {
-    auto const ymd =  // chrono class handles the leap year calculations for us
-      cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year},
-                                        cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
-                                        cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
-    auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
-
-    return (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+    return (ts.year * 400 + (ts.month - 1) * 31 + ts.day - 1) * 86400L + ts.hour * 3600L +
+           ts.minute * 60L + ts.second;
   }
 
   /**
-   * Leverage STL to convert local time to UTC unix_timestamp(in milliseconds)
+   * Leverage STL to convert local time to UTC unix_timestamp(in microseconds)
    */
   __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
   {
-    int64_t timestamp_s = compute_epoch_s(ts);
+    auto const ymd =  // chrono class handles the leap year calculations for us
+      cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year},
+                                        cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
+                                        cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
+    auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
+
+    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
     return timestamp_s * 1000000L + ts.microseconds;
   }
 
@@ -541,19 +594,20 @@ struct parse_timestamp_string_fn {
  * The common entrance of string_to_timestamp, two paths call this function:
  * - `string_to_timestamp_with_tz` : with time zone
  * - `string_to_timestamp_without_tz` : without time zone
- * The parameters transitions, tz_indices and default_tz_index are only for handling
+ * The parameters transitions, sorted_tz_names and default_tz_index are only for handling
  * inputs with timezone.
- * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices
+ * It's called from `string_to_timestamp_without_tz` if transitions and sorted_tz_names
  * are nullptr, otherwise called from `string_to_timestamp_with_tz`.
  *
  */
-std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
-                                           bool ansi_mode,
-                                           bool allow_tz_in_date_str                   = true,
-                                           size_type default_tz_index                  = 1000000000,
-                                           cudf::column_view const* transitions        = nullptr,
-                                           cudf::strings_column_view const* tz_indices = nullptr,
-                                           cudf::column_view const* tz_short_ids       = nullptr)
+std::unique_ptr<cudf::column> to_timestamp(
+  cudf::strings_column_view const& input,
+  bool ansi_mode,
+  bool allow_tz_in_date_str                        = true,
+  size_type default_tz_index                       = 1000000000,
+  cudf::column_view const* transitions             = nullptr,
+  cudf::strings_column_view const* sorted_tz_names = nullptr,
+  cudf::column_view const* tz_short_ids            = nullptr)
 {
   auto const stream = cudf::get_default_stream();
   auto const mr     = rmm::mr::get_current_device_resource();
@@ -570,7 +624,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
   auto result_valid_col = cudf::make_fixed_width_column(
     cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr);
 
-  if (transitions == nullptr || tz_indices == nullptr) {
+  if (transitions == nullptr || sorted_tz_names == nullptr) {
     thrust::transform(
       rmm::exec_policy(stream),
       thrust::make_counting_iterator(0),
@@ -582,7 +636,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
   } else {
     auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
-    auto d_tz_indices        = cudf::column_device_view::create(tz_indices->parent(), stream);
+    auto d_sorted_tz_names   = cudf::column_device_view::create(sorted_tz_names->parent(), stream);
     auto d_tz_short_ids      = column_device_view::create(*tz_short_ids, stream);
 
     thrust::transform(
@@ -593,7 +647,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
         thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
                            result_valid_col->mutable_view().begin<uint8_t>())),
       parse_timestamp_string_fn<true>{
-        *d_strings, default_tz_index, true, d_transitions, *d_tz_indices, *d_tz_short_ids});
+        *d_strings, default_tz_index, true, d_transitions, *d_sorted_tz_names, *d_tz_short_ids});
   }
 
   auto valid_view = result_valid_col->mutable_view();
@@ -637,14 +691,14 @@ namespace spark_rapids_jni {
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
   cudf::strings_column_view const& input,
   cudf::column_view const& transitions,
-  cudf::strings_column_view const& tz_indices,
+  cudf::strings_column_view const& sorted_tz_names,
   cudf::size_type default_tz_index,
   bool ansi_mode,
   cudf::column_view const& tz_short_ids)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(
-    input, ansi_mode, true, default_tz_index, &transitions, &tz_indices, &tz_short_ids);
+    input, ansi_mode, true, default_tz_index, &transitions, &sorted_tz_names, &tz_short_ids);
 }
 
 /**
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index a87a754f35..cb0a29ce1f 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -266,6 +266,11 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
    * allowSpecialExpressions = true, ansiEnabled = false)
    * ts is: ['2023-01-01 00:00:00', '2023-01-01T08:00:00']
    * 
+   * Note: this function never applies the time zones found in the strings.
+   * allowTimeZone controls whether a time zone is allowed in the timestamp string.
+   * If allowTimeZone is true, any time zone present in a string is ignored.
+   * If allowTimeZone is false and ANSI mode is on, an exception is thrown when a string contains a time zone.
+   * 
    * @param cv                      The input string column to be converted.
    * @param allowTimeZone           whether allow time zone in the timestamp
    *                                string. e.g.:
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 087aba7549..8189364b6e 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -17,6 +17,7 @@
 package com.nvidia.spark.rapids.jni;
 
 import java.time.Instant;
+import java.time.LocalDateTime;
 import java.time.ZoneId;
 import java.time.zone.ZoneOffsetTransition;
 import java.time.zone.ZoneRules;
@@ -24,11 +25,13 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.TimeZone;
 import java.util.concurrent.*;
+import java.util.function.Function;
 
 import ai.rapids.cudf.*;
 
@@ -175,21 +178,29 @@ private void loadData(Executor executor) throws IllegalStateException {
   }
 
   /**
-   * load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
+   * load ZoneId.SHORT_IDS and map to time zone index in transition table.
    */
-  private void loadTimeZoneShortIDs() {
+  private void loadTimeZoneShortIDs(Map<String, Integer> zoneIdToTable) {
     HostColumnVector.DataType type = new HostColumnVector.StructType(false,
     new HostColumnVector.BasicType(false, DType.STRING),
-    new HostColumnVector.BasicType(false, DType.STRING));
+    new HostColumnVector.BasicType(false, DType.INT32));
     ArrayList<HostColumnVector.StructData> data = new ArrayList<>();
+    // copy short IDs
     List<String> idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet());
-    idList.add("Z");
+    // sort short IDs
     Collections.sort(idList);
     for (String id : idList) {
-      if (id.equals("Z")) {
-        data.add(new HostColumnVector.StructData(id, "UTC"));
+      String mapTo = ZoneId.SHORT_IDS.get(id);
+      if (mapTo.startsWith("+") || mapTo.startsWith("-")) {
+        // skip: EST: -05:00; HST: -10:00; MST: -07:00
+        // kernel will handle EST, HST, MST
+        // ZoneId.SHORT_IDS is deprecated, so it will probably not change
       } else {
-        data.add(new HostColumnVector.StructData(id, ZoneId.SHORT_IDS.get(id)));
+        Integer index = zoneIdToTable.get(mapTo);
+        // some short IDs map to time zones that are not in the table (e.g. DST zones); skip them
+        if (index != null) {
+          data.add(new HostColumnVector.StructData(id, index));
+        }
       }
     }
     shortIDs = HostColumnVector.fromStructs(type, data);
@@ -203,9 +214,6 @@ public ColumnVector getTimeZoneShortIDs() {
   private void doLoadData() {
     synchronized (this) {
       try {
-        // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
-        loadTimeZoneShortIDs();
-        
         Map<String, Integer> zoneIdToTable = new HashMap<>();
         List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
         // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings.
@@ -218,20 +226,40 @@ private void doLoadData() {
         List<String> zondIdList = new ArrayList<>();
         List<String> unsupportedZoneList = new ArrayList<>();
         
-        // sort the IDs
-        String[] availableIDs = TimeZone.getAvailableIDs();
-        Arrays.sort(availableIDs);
-
-        for (String tzId : availableIDs) {
+        // collect zone id and sort
+        List<ZoneId> ids = new ArrayList<>();
+        for (String tzId : TimeZone.getAvailableIDs()) {
           ZoneId zoneId;
           try {
             zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe
+            ids.add(zoneId);
           } catch (ZoneRulesException e) {
             // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however,
             // this use is deprecated due to ambiguity reasons (same abbrevation can be used for 
             // multiple time zones). These are not supported by ZoneId.of(...) directly here.
             continue;
           }
+        }
+        Collections.sort(ids, new Comparator<ZoneId>() {
+          @Override
+          public int compare(ZoneId o1, ZoneId o2) {
+            // sort by `getId`
+            return o1.getId().compareTo(o2.getId());
+          }
+        });
+
+        // A simple approach to transform LocalDateTime to a value which is proportional to
+        // the exact EpochSecond. After caching these values along with EpochSeconds, we
+        // can easily search out which time zone transition rule we should apply according
+        // to LocalDateTime structs. The searching procedure is same as the binary search with
+        // exact EpochSeconds(convert_timestamp_tz_functor), except using "loose instant"
+        // as search index instead of exact EpochSeconds.
+        Function<LocalDateTime, Long> localToLooseEpochSecond = lt ->
+                86400L * (lt.getYear() * 400L + (lt.getMonthValue() - 1) * 31L +
+                        lt.getDayOfMonth() - 1) +
+                        3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond();
+
+        for (ZoneId zoneId : ids) {
           ZoneRules zoneRules = zoneId.getRules();
           // Filter by non-repeating rules
           if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) {
@@ -266,7 +294,8 @@ private void doLoadData() {
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds()
+                          t.getOffsetAfter().getTotalSeconds(),
+                          localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time
                       )
                   );
                 } else {
@@ -274,7 +303,8 @@ private void doLoadData() {
                       new HostColumnVector.StructData(
                           t.getInstant().getEpochSecond(),
                           t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(),
-                          t.getOffsetAfter().getTotalSeconds()
+                          t.getOffsetAfter().getTotalSeconds(),
+                          localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time
                       )
                   );
                 }
@@ -288,6 +318,9 @@ private void doLoadData() {
         }
         zoneIdToTableFuture.complete(zoneIdToTable);
 
+        // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
+        loadTimeZoneShortIDs(zoneIdToTable);
+
         HostColumnVector.DataType childType = new HostColumnVector.StructType(false,
             new HostColumnVector.BasicType(false, DType.INT64),
             new HostColumnVector.BasicType(false, DType.INT64),
@@ -331,6 +364,10 @@ private HostColumnVector getHostFixedTransitions() {
     }
   }
 
+  /**
+   * get map from time zone to time zone index in transition table. 
+   * @return map from time zone to time zone index in transition table. 
+   */
   public Map<String, Integer> getZoneIDMap() {
     try {
       return zoneIdToTableFuture.get(TIMEOUT_SECS, TimeUnit.SECONDS);
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index ca3049bfed..86f5203249 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -31,6 +31,7 @@
 import ai.rapids.cudf.AssertUtils;
 import ai.rapids.cudf.ColumnVector;
 import ai.rapids.cudf.DType;
+import ai.rapids.cudf.HostColumnVector;
 import ai.rapids.cudf.Table;
 
 public class CastStringsTest {
@@ -407,6 +408,10 @@ void toTimestampTestWithTz() {
     // short TZ ID: BST->Asia/Dhaka, CTT->Asia/Shanghai
     entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 CTT", 1699124695100000L));
     entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 BST", 1699124695100000L + 7200L * 1000000L)); // BST is 2 hours later than CTT
+    // short TZ ID: EST: -05:00; HST: -10:00; MST: -07:00
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 EST", 1699124695100000L + 13L * 3600L * 1000000L)); // EST is 8 + 5  hours later than Asia/Shanghai
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 HST", 1699124695100000L + 18L * 3600L * 1000000L)); // HST is 8 + 10 hours later than Asia/Shanghai
+    entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 MST", 1699124695100000L + 15L * 3600L * 1000000L)); // MST is 8 + 7  hours later than Asia/Shanghai
 
     int validDataSize = entries.size();
 
@@ -425,7 +430,6 @@ void toTimestampTestWithTz() {
     entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 -180001", null));
     entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 UTC+18:00:10", null));
     entries.add(new AbstractMap.SimpleEntry<>("2000-01-29 10:20:30 GMT-23:5", null));
-
     List<String> inputs = new ArrayList<>();
     List<Long> expects = new ArrayList<>();
     for (Map.Entry<String, Long> entry : entries) {
@@ -434,9 +438,9 @@ void toTimestampTestWithTz() {
     }
 
     // Throw unsupported exception for symbols because Europe/London contains DST rules
-    assertThrows(ai.rapids.cudf.CudfException.class, () -> {
+    assertThrows(IllegalArgumentException.class, () -> {
       try (ColumnVector input = ColumnVector.fromStrings("2000-01-29 1:2:3 Europe/London")) {
-        CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
+        CastStrings.toTimestamp(input, ZoneId.of("UTC"), true);
       }
     });
 

From 93a8331de69255b500299cde504b63dbb1aea0e1 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Wed, 24 Jan 2024 15:26:12 +0800
Subject: [PATCH 26/35] Fixes; Comments

---
 src/main/cpp/src/datetime_parser.cu           | 26 ++++++++++---------
 .../nvidia/spark/rapids/jni/CastStrings.java  |  5 ++--
 .../spark/rapids/jni/GpuTimeZoneDB.java       | 11 +++++---
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 18750c9a27..0d1040766e 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -176,21 +176,23 @@ struct parse_timestamp_string_fn {
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
 
-      // map tz short IDs to time zone index in transitions.
+      // try to map tz short IDs to time zone index in transitions.
       // Here only handle regional base tz map: short ID->regional based tz
       // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00
-      auto const short_tz_id_col = tz_short_ids->child(0);
-      auto const map_to_tz_col   = tz_short_ids->child(1);
-      auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
-                                                               get_string_fn{short_tz_id_col});
-      auto string_iter_end   = string_iter_begin + short_tz_id_col.size();
-      auto it                = thrust::lower_bound(
-        thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less<string_view>());
       int tz_index_for_short_tz = -1;
-      if (it != string_iter_end && *it == tz_view) {
-        // found a map, get the time zone index
-        auto short_id_index   = static_cast<size_type>(it - string_iter_begin);
-        tz_index_for_short_tz = static_cast<int>(map_to_tz_col.element<int32_t>(short_id_index));
+      if (tz_view.length() == 3) {  // short ID length is always 3
+        auto const short_tz_id_col = tz_short_ids->child(0);
+        auto const map_to_tz_col   = tz_short_ids->child(1);
+        auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                                                 get_string_fn{short_tz_id_col});
+        auto string_iter_end   = string_iter_begin + short_tz_id_col.size();
+        auto it                = thrust::lower_bound(
+          thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less<string_view>());
+        if (it != string_iter_end && *it == tz_view) {
+          // found a map, get the time zone index
+          auto short_id_index   = static_cast<size_type>(it - string_iter_begin);
+          tz_index_for_short_tz = static_cast<int>(map_to_tz_col.element<int32_t>(short_id_index));
+        }
       }
 
       if (tz_index_for_short_tz >= 0) {
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index cb0a29ce1f..4a91dc8557 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -185,8 +185,8 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    * Note:
    * - Do not support cast special strings(epoch now today yesterday tomorrow) to timestamp.
    * Spark31x supports cast special strings while Spark320+ do not supports
-   * - Do not support DST time zones, throw ai.rapids.cudf.CudfException
-   *   if contains DST time zones.
+   * - Do not support DST time zones, return null in non-ANSI mode.
+   * TODO: DST support.
    *
    * Example:
    * input = [" 2023", "2023-01-01T08:00:00Asia/Shanghai "]
@@ -206,7 +206,6 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
    * @return a timestamp column
    * @throws IllegalArgumentException if any string in cv has invalid format or the time zone is
    *                                  non-existed/wrong when ansiEnabled is true
-   * @throws CudfException            if time zone is a DST time zone
    */
   public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) {
     if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) {
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 8189364b6e..eac216478a 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -45,8 +45,7 @@ public class GpuTimeZoneDB {
   private CompletableFuture<Map<String, Integer>> zoneIdToTableFuture;
   private CompletableFuture<HostColumnVector> fixedTransitionsFuture;
   private CompletableFuture<HostColumnVector> zoneIdVectorFuture;
-  // Used to store Java ZoneId.SHORT_IDS Map, e.g.: PST:America/Los_Angeles
-  // Note: also add a entry: Z->UTC
+  // Used to store Java ZoneId.SHORT_IDS Map: PST -> index of America/Los_Angeles in transition table.
   private HostColumnVector shortIDs;
 
   private boolean closed = false;
@@ -179,6 +178,7 @@ private void loadData(Executor executor) throws IllegalStateException {
 
   /**
    * load ZoneId.SHORT_IDS and map to time zone index in transition table.
+   * Note: ignored EST: -05:00; HST: -10:00; MST: -07:00
    */
   private void loadTimeZoneShortIDs(Map<String, Integer> zoneIdToTable) {
     HostColumnVector.DataType type = new HostColumnVector.StructType(false,
@@ -190,6 +190,7 @@ private void loadTimeZoneShortIDs(Map<String, Integer> zoneIdToTable) {
     // sort short IDs
     Collections.sort(idList);
     for (String id : idList) {
+      assert(id.length() == 3); // short ID lenght is always 3
       String mapTo = ZoneId.SHORT_IDS.get(id);
       if (mapTo.startsWith("+") || mapTo.startsWith("-")) {
         // skip: EST: -05:00; HST: -10:00; MST: -07:00
@@ -200,6 +201,8 @@ private void loadTimeZoneShortIDs(Map<String, Integer> zoneIdToTable) {
         // some short IDs are DST, skip unsupported
         if (index != null) {
           data.add(new HostColumnVector.StructData(id, index));
+        } else {
+          // TODO: index should not be null after DST is supported.
         }
       }
     }
@@ -231,7 +234,7 @@ private void doLoadData() {
         for (String tzId : TimeZone.getAvailableIDs()) {
           ZoneId zoneId;
           try {
-            zoneId = ZoneId.of(tzId).normalized(); // we use the normalized form to dedupe
+            zoneId = ZoneId.of(tzId, ZoneId.SHORT_IDS).normalized(); // we use the normalized form to dedupe
             ids.add(zoneId);
           } catch (ZoneRulesException e) {
             // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however,
@@ -318,7 +321,7 @@ public int compare(ZoneId o1, ZoneId o2) {
         }
         zoneIdToTableFuture.complete(zoneIdToTable);
 
-        // load ZoneId.SHORT_IDS and append Z->UTC, then sort the IDs.
+        // load ZoneId.SHORT_IDS
         loadTimeZoneShortIDs(zoneIdToTable);
 
         HostColumnVector.DataType childType = new HostColumnVector.StructType(false,

From c8dffb130547e88271fad80e0abdbf39a2d75f2b Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 25 Jan 2024 10:12:30 +0800
Subject: [PATCH 27/35] Refactor GpuTimeZoneDB; Add comment explaining why year
 has max 6 digits

---
 src/main/cpp/src/datetime_parser.cu           |  8 +++++++-
 .../nvidia/spark/rapids/jni/CastStrings.java  | 12 ++++-------
 .../spark/rapids/jni/GpuTimeZoneDB.java       | 20 +++++++++++--------
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 0d1040766e..0fdcc4f9bf 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -64,7 +64,13 @@ namespace {
  * Represents local date time in a time zone.
  */
 struct timestamp_components {
-  int32_t year;  // max 6 digits
+  /**
+   * year: Max 6 digits.
+   * Spark stores timestamp into Long in microseconds.
+   * A Long is able to represent a timestamp within [+-]200 thousand years.
+   * Calculated from: Long.MaxValue/MinValue / microseconds_per_year
+  */
+  int32_t year;
   int8_t month;
   int8_t day;
   int8_t hour;
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 74cea9e902..d67bc9c208 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -213,13 +213,10 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
               defaultTimeZone.toString()));
     }
 
-    GpuTimeZoneDB singleton = GpuTimeZoneDB.getInstance();
-    GpuTimeZoneDB.cacheDatabase();
-    Integer tzIndex = singleton.getZoneIDMap().get(defaultTimeZone.normalized().toString());
-
-    try (Table transitions = singleton.getTransitions();
-         ColumnVector tzIndices = singleton.getZoneIDVector();
-         ColumnVector tzShortIDs = singleton.getTimeZoneShortIDs()) {
+    Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.normalized().toString());
+    try (Table transitions = GpuTimeZoneDB.getTransitions();
+         ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector();
+         ColumnVector tzShortIDs = GpuTimeZoneDB.getTimeZoneShortIDs()) {
       return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(),
               tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView()));
     }
@@ -279,7 +276,6 @@ public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, bo
    *
    */
   public static ColumnVector toTimestampWithoutTimeZone(ColumnView cv, boolean allowTimeZone, boolean ansiEnabled) {
-    GpuTimeZoneDB.cacheDatabase();
     return new ColumnVector(toTimestampWithoutTimeZone(cv.getNativeView(), allowTimeZone,  ansiEnabled));
   }
 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 02a171cae6..bf4e171c8d 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -444,32 +444,36 @@ public int compare(ZoneId o1, ZoneId o2) {
    * get map from time zone to time zone index in transition table. 
    * @return map from time zone to time zone index in transition table. 
    */
-  public Map<String, Integer> getZoneIDMap() {
-    return zoneIdToTable;
+  public static Map<String, Integer> getZoneIDMap() {
+    cacheDatabase();
+    return instance.zoneIdToTable;
   }
 
   /**
    * Get a map from short ID to time zone index in transitions for the short ID mapped time zone
    * @return
    */
-  public ColumnVector getTimeZoneShortIDs() {
-    return shortIDs.copyToDevice();
+  public static ColumnVector getTimeZoneShortIDs() {
+    cacheDatabase();
+    return instance.shortIDs.copyToDevice();
   }
 
   /**
    * Get a time zone list which is corresponding to the transitions
    * @return
    */
-  public ColumnVector getZoneIDVector() {
-    return zoneIdVector.copyToDevice();
+  public static ColumnVector getZoneIDVector() {
+    cacheDatabase();
+    return instance.zoneIdVector.copyToDevice();
   }
 
   /**
    * Transition table
    * @return
    */
-  public Table getTransitions() {
-    try (ColumnVector fixedTransitions = getFixedTransitions()) {
+  public static Table getTransitions() {
+    cacheDatabase();
+    try (ColumnVector fixedTransitions = instance.getFixedTransitions()) {
       return new Table(fixedTransitions);
     }
   }

From 4104173b26630e502a0c4a6fd90c6ebe312f90af Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 25 Jan 2024 10:18:13 +0800
Subject: [PATCH 28/35] format cpp code

---
 .clang-format                       | 155 ++++++++++++++++++++++++++++
 src/main/cpp/src/datetime_parser.cu |   2 +-
 2 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..26b9a5bf4c
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,155 @@
+---
+# Refer to the following link for the explanation of each params:
+#   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignConsecutiveBitFields: true
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLambdasOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+# This is deprecated
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments:  false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:            false
+  AfterControlStatement: false
+  AfterEnum:             false
+  AfterFunction:         false
+  AfterNamespace:        false
+  AfterObjCDeclaration:  false
+  AfterStruct:           false
+  AfterUnion:            false
+  AfterExternBlock:      false
+  BeforeCatch:           false
+  BeforeElse:            false
+  IndentBraces:          false
+  # disabling the below splits, else, they'll just add to the vertical length of source files!
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakAfterJavaFieldAnnotations: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: WebKit
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
+ConstructorInitializerIndentWidth: 2
+ContinuationIndentWidth: 2
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+# Enabling comment reflow causes doxygen comments to be messed up in their formats!
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++17
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+# Be consistent with indent-width, even for people who use tab for indentation!
+TabWidth: 2
+UseTab: Never
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index 0fdcc4f9bf..dd30dae537 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -69,7 +69,7 @@ struct timestamp_components {
    * Spark stores timestamp into Long in microseconds.
    * A Long is able to represent a timestamp within [+-]200 thousand years.
    * Calculated from: Long.MaxValue/MinValue / microseconds_per_year
-  */
+   */
   int32_t year;
   int8_t month;
   int8_t day;

From 5af012c1c764a2273a50e445c024fa67410e747b Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 25 Jan 2024 17:06:03 +0800
Subject: [PATCH 29/35] Remove .clang-format

---
 .clang-format | 155 --------------------------------------------------
 1 file changed, 155 deletions(-)
 delete mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
deleted file mode 100644
index 26b9a5bf4c..0000000000
--- a/.clang-format
+++ /dev/null
@@ -1,155 +0,0 @@
----
-# Refer to the following link for the explanation of each params:
-#   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
-Language: Cpp
-# BasedOnStyle: Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: true
-AlignConsecutiveBitFields: true
-AlignConsecutiveDeclarations: false
-AlignConsecutiveMacros: true
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllArgumentsOnNextLine: true
-AllowAllConstructorInitializersOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortEnumsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLambdasOnASingleLine: true
-AllowShortLoopsOnASingleLine: false
-# This is deprecated
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments:  false
-BinPackParameters: false
-BraceWrapping:
-  AfterClass:            false
-  AfterControlStatement: false
-  AfterEnum:             false
-  AfterFunction:         false
-  AfterNamespace:        false
-  AfterObjCDeclaration:  false
-  AfterStruct:           false
-  AfterUnion:            false
-  AfterExternBlock:      false
-  BeforeCatch:           false
-  BeforeElse:            false
-  IndentBraces:          false
-  # disabling the below splits, else, they'll just add to the vertical length of source files!
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-BreakAfterJavaFieldAnnotations: false
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: WebKit
-BreakBeforeInheritanceComma: false
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakInheritanceList: BeforeColon
-BreakStringLiterals: true
-ColumnLimit: 100
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
-ConstructorInitializerIndentWidth: 2
-ContinuationIndentWidth: 2
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-RawStringFormats:
-  - Language: Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-  - Language: TextProto
-    Delimiters:
-      - pb
-      - PB
-      - proto
-      - PROTO
-    EnclosingFunctions:
-      - EqualsProto
-      - EquivToProto
-      - PARSE_PARTIAL_TEXT_PROTO
-      - PARSE_TEST_PROTO
-      - PARSE_TEXT_PROTO
-      - ParseTextOrDie
-      - ParseTextProtoOrDie
-    CanonicalDelimiter: ''
-    BasedOnStyle: google
-# Enabling comment reflow causes doxygen comments to be messed up in their formats!
-ReflowComments: true
-SortIncludes: true
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceBeforeSquareBrackets: false
-SpaceInEmptyBlock: false
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInConditionalStatement: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: c++17
-StatementMacros:
-  - Q_UNUSED
-  - QT_REQUIRE_VERSION
-# Be consistent with indent-width, even for people who use tab for indentation!
-TabWidth: 2
-UseTab: Never

From 97a8f8f0135a922acb3bc20a6f118c78b207638f Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Thu, 25 Jan 2024 17:09:29 +0800
Subject: [PATCH 30/35] Fix missing support for non-normalized time zones, e.g.
 Etc/GMT; Optimize short time zone ID handling by removing the binary search on
 short IDs

---
 src/main/cpp/src/CastStringJni.cpp            |  14 +-
 src/main/cpp/src/datetime_parser.cu           | 154 ++++------
 src/main/cpp/src/datetime_parser.hpp          |  12 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |  11 +-
 .../spark/rapids/jni/GpuTimeZoneDB.java       | 278 +++++++++---------
 .../spark/rapids/jni/CastStringsTest.java     |   8 +-
 6 files changed, 211 insertions(+), 266 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index fa48650f32..1d39e7152e 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -264,8 +264,7 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
                                                          jlong transitions_handle,
                                                          jlong tz_indices_col,
                                                          jint tz_default_index,
-                                                         jboolean ansi_enabled,
-                                                         jlong tz_short_ids)
+                                                         jboolean ansi_enabled)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
   try {
@@ -275,12 +274,11 @@ Java_com_nvidia_spark_rapids_jni_CastStrings_toTimestamp(JNIEnv* env,
       cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(input_column));
     auto const transitions =
       reinterpret_cast<cudf::table_view const*>(transitions_handle)->column(0);
-    auto const& tz_indices_view =
-      cudf::strings_column_view(*reinterpret_cast<cudf::column_view const*>(tz_indices_col));
-    auto const tz_index                        = static_cast<cudf::size_type>(tz_default_index);
-    const cudf::column_view* tz_short_ids_view = reinterpret_cast<cudf::column_view*>(tz_short_ids);
-    auto ret_cv                                = spark_rapids_jni::string_to_timestamp_with_tz(
-      input_view, transitions, tz_indices_view, tz_index, ansi_enabled, *tz_short_ids_view);
+    const cudf::column_view* tz_indices_view =
+      reinterpret_cast<cudf::column_view const*>(tz_indices_col);
+    auto const tz_index = static_cast<cudf::size_type>(tz_default_index);
+    auto ret_cv         = spark_rapids_jni::string_to_timestamp_with_tz(
+      input_view, transitions, *tz_indices_view, tz_index, ansi_enabled);
     if (ret_cv) { return cudf::jni::release_as_jlong(ret_cv); }
   }
   CATCH_STD(env, 0);
diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index dd30dae537..fb72944cdb 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -140,8 +140,7 @@ struct parse_timestamp_string_fn {
   // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
   // looseTzInstant: int64>>.
   thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
-  thrust::optional<column_device_view const> sorted_tz_names   = thrust::nullopt;
-  thrust::optional<column_device_view const> tz_short_ids      = thrust::nullopt;
+  thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
 
   __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
   {
@@ -181,50 +180,23 @@ struct parse_timestamp_string_fn {
       utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
     } else {
       auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
-
-      // try to map tz short IDs to time zone index in transitions.
-      // Here only handle regional base tz map: short ID->regional based tz
-      // Note: here do not handle special short IDs: EST: -05:00; HST: -10:00; MST: -07:00
-      int tz_index_for_short_tz = -1;
-      if (tz_view.length() == 3) {  // short ID length is always 3
-        auto const short_tz_id_col = tz_short_ids->child(0);
-        auto const map_to_tz_col   = tz_short_ids->child(1);
-        auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
-                                                                 get_string_fn{short_tz_id_col});
-        auto string_iter_end   = string_iter_begin + short_tz_id_col.size();
-        auto it                = thrust::lower_bound(
-          thrust::seq, string_iter_begin, string_iter_end, tz_view, thrust::less<string_view>());
-        if (it != string_iter_end && *it == tz_view) {
-          // found a map, get the time zone index
-          auto short_id_index   = static_cast<size_type>(it - string_iter_begin);
-          tz_index_for_short_tz = static_cast<int>(map_to_tz_col.element<int32_t>(short_id_index));
-        }
-      }
-
-      if (tz_index_for_short_tz >= 0) {
-        // it's a supported short ID, and found the tz index
-        utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index_for_short_tz);
-      } else {
-        // Firstly, try parsing as utc-like timezone rep
-        // Note: parse_utc_like_tz handles special short IDs: EST: -05:00; HST: -10:00; MST: -07:00
-        auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view);
-        if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
-          utc_offset = fix_offset;
-        } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
-          // Then, try parsing as region-based timezone ID
-          auto tz_index = query_index_from_tz_db(tz_view);
-          if (tz_index < 0) {
-            // TODO: distinguish unsupported and invalid tz
-            return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
-                                      ParseResult::INVALID);
-          } else {
-            // supported tz
-            utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index);
-          }
-        } else {
-          // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid
+      // Firstly, try parsing as utc-like timezone rep
+      auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view);
+      if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
+        utc_offset = fix_offset;
+      } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
+        // Then, try parsing as region-based timezone ID
+        auto tz_index = query_index_from_tz_db(tz_view);
+        if (tz_index < 0) {
+          // TODO: distinguish unsupported and invalid tz
           return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
+        } else {
+          // supported tz
+          utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), tz_index);
         }
+      } else {
+        // (ret_code == ParseUtcLikeTzResult::INVALID) quick path to mark value invalid
+        return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       }
     }
 
@@ -247,12 +219,6 @@ struct parse_timestamp_string_fn {
    * This function is purposed to be fully align to Apache Spark's behavior. The
    * function returns the status along with the ParseUtcLikeTzResult result.
    *
-   * Valid patterns:
-   *   Z: means UTC
-   *   short tz IDs that is UTC like
-   *     EST: -05:00
-   *     HST: -10:00
-   *     MST: -07:00
    *   with colon
    *     hh:mm      : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):(\d|[0-5][0-9])
    *     hh:mm:ss   : ^(GMT|UTC)?[+-](\d|0[0-9]|1[0-8]):[0-5][0-9]:[0-5][0-9]
@@ -270,23 +236,6 @@ struct parse_timestamp_string_fn {
 
     char const* ptr = tz_lit.data();
 
-    // Z time zone
-    if (len == 1 && *ptr == 'Z') { return {0, ParseUtcLikeTzResult::UTC_LIKE_TZ}; }
-
-    // handle short tz IDs that is UTC like: EST, HST, MST
-    if (len == 3) {
-      if ((*ptr == 'E' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
-        // EST: -05:00
-        return {-5L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
-      } else if ((*ptr == 'H' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
-        // HST: -10:00
-        return {-10L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
-      } else if ((*ptr == 'M' && *(ptr + 1) == 'S' && *(ptr + 2) == 'T')) {
-        // MST: -07:00
-        return {-7L * 3600L, ParseUtcLikeTzResult::UTC_LIKE_TZ};
-      }
-    }
-
     size_t char_offset = 0;
     // skip UTC|GMT if existing
     if (len > 2 && ((*ptr == 'G' && *(ptr + 1) == 'M' && *(ptr + 2) == 'T') ||
@@ -363,14 +312,18 @@ struct parse_timestamp_string_fn {
    */
   __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const
   {
-    auto string_iter_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
-                                                             get_string_fn{*sorted_tz_names});
-    auto string_iter_end   = string_iter_begin + sorted_tz_names->size();
-    auto it                = thrust::lower_bound(
+    auto const tz_col                  = tz_indices->child(0);
+    auto const index_in_transition_col = tz_indices->child(1);
+
+    auto string_iter_begin =
+      thrust::make_transform_iterator(thrust::make_counting_iterator(0), get_string_fn{tz_col});
+    auto string_iter_end = string_iter_begin + tz_col.size();
+    auto it              = thrust::lower_bound(
       thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less<string_view>());
     if (it != string_iter_end && *it == tz_lit) {
       // found tz
-      return static_cast<int>(it - string_iter_begin);
+      auto tz_name_index = static_cast<size_type>(it - string_iter_begin);
+      return static_cast<int>(index_in_transition_col.element<int32_t>(tz_name_index));
     } else {
       // not found tz
       return -1;
@@ -602,20 +555,18 @@ struct parse_timestamp_string_fn {
  * The common entrance of string_to_timestamp, two paths call this function:
  * - `string_to_timestamp_with_tz` : with time zone
  * - `string_to_timestamp_without_tz` : without time zone
- * The parameters transitions, sorted_tz_names and default_tz_index are only for handling
+ * The parameters transitions, tz_indices and default_tz_index are only for handling
  * inputs with timezone.
- * It's called from `string_to_timestamp_without_tz` if transitions and sorted_tz_names
+ * It's called from `string_to_timestamp_without_tz` if transitions and tz_indices
  * are nullptr, otherwise called from `string_to_timestamp_with_tz`.
  *
  */
-std::unique_ptr<cudf::column> to_timestamp(
-  cudf::strings_column_view const& input,
-  bool ansi_mode,
-  bool allow_tz_in_date_str                        = true,
-  size_type default_tz_index                       = 1000000000,
-  cudf::column_view const* transitions             = nullptr,
-  cudf::strings_column_view const* sorted_tz_names = nullptr,
-  cudf::column_view const* tz_short_ids            = nullptr)
+std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
+                                           bool ansi_mode,
+                                           bool allow_tz_in_date_str            = true,
+                                           size_type default_tz_index           = 1000000000,
+                                           cudf::column_view const* transitions = nullptr,
+                                           cudf::column_view const* tz_indices  = nullptr)
 {
   auto const stream = cudf::get_default_stream();
   auto const mr     = rmm::mr::get_current_device_resource();
@@ -632,7 +583,7 @@ std::unique_ptr<cudf::column> to_timestamp(
   auto result_valid_col = cudf::make_fixed_width_column(
     cudf::data_type{cudf::type_id::UINT8}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr);
 
-  if (transitions == nullptr || sorted_tz_names == nullptr) {
+  if (transitions == nullptr || tz_indices == nullptr) {
     thrust::transform(
       rmm::exec_policy(stream),
       thrust::make_counting_iterator(0),
@@ -644,18 +595,16 @@ std::unique_ptr<cudf::column> to_timestamp(
   } else {
     auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
     auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
-    auto d_sorted_tz_names   = cudf::column_device_view::create(sorted_tz_names->parent(), stream);
-    auto d_tz_short_ids      = column_device_view::create(*tz_short_ids, stream);
-
-    thrust::transform(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(input.size()),
-      thrust::make_zip_iterator(
-        thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
-                           result_valid_col->mutable_view().begin<uint8_t>())),
-      parse_timestamp_string_fn<true>{
-        *d_strings, default_tz_index, true, d_transitions, *d_sorted_tz_names, *d_tz_short_ids});
+    auto d_tz_indices        = cudf::column_device_view::create(*tz_indices, stream);
+
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::make_counting_iterator(0),
+                      thrust::make_counting_iterator(input.size()),
+                      thrust::make_zip_iterator(
+                        thrust::make_tuple(result_col->mutable_view().begin<cudf::timestamp_us>(),
+                                           result_valid_col->mutable_view().begin<uint8_t>())),
+                      parse_timestamp_string_fn<true>{
+                        *d_strings, default_tz_index, true, d_transitions, *d_tz_indices});
   }
 
   auto valid_view = result_valid_col->mutable_view();
@@ -696,17 +645,14 @@ namespace spark_rapids_jni {
  * timestamp column otherwise.
  *
  */
-std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
-  cudf::strings_column_view const& input,
-  cudf::column_view const& transitions,
-  cudf::strings_column_view const& sorted_tz_names,
-  cudf::size_type default_tz_index,
-  bool ansi_mode,
-  cudf::column_view const& tz_short_ids)
+std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_view const& input,
+                                                          cudf::column_view const& transitions,
+                                                          cudf::column_view const& tz_indices,
+                                                          cudf::size_type default_tz_index,
+                                                          bool ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
-  return to_timestamp(
-    input, ansi_mode, true, default_tz_index, &transitions, &sorted_tz_names, &tz_short_ids);
+  return to_timestamp(input, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
 }
 
 /**
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index f750594f9f..2d45f68dca 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -72,13 +72,11 @@ namespace spark_rapids_jni {
  * @returns the pointer of the timestamp result column, which points to nullptr
  * if there exists invalid inputs and ANSI mode is on.
  */
-std::unique_ptr<cudf::column> string_to_timestamp_with_tz(
-  cudf::strings_column_view const& input,
-  cudf::column_view const& transitions,
-  cudf::strings_column_view const& tz_indices,
-  cudf::size_type default_tz_index,
-  bool ansi_mode,
-  cudf::column_view const& tz_short_ids);
+std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_view const& input,
+                                                          cudf::column_view const& transitions,
+                                                          cudf::column_view const& tz_indices,
+                                                          cudf::size_type default_tz_index,
+                                                          bool ansi_mode);
 
 /**
  *
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index d67bc9c208..3c4c4a3cc6 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -210,15 +210,14 @@ public static ColumnVector fromIntegersWithBase(ColumnView cv, int base) {
   public static ColumnVector toTimestamp(ColumnView cv, ZoneId defaultTimeZone, boolean ansiEnabled) {
     if (!GpuTimeZoneDB.isSupportedTimeZone(defaultTimeZone)) {
       throw new IllegalArgumentException(String.format("Unsupported timezone: %s",
-              defaultTimeZone.toString()));
+              defaultTimeZone.getId()));
     }
 
-    Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.normalized().toString());
+    Integer tzIndex = GpuTimeZoneDB.getZoneIDMap().get(defaultTimeZone.getId());
     try (Table transitions = GpuTimeZoneDB.getTransitions();
-         ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector();
-         ColumnVector tzShortIDs = GpuTimeZoneDB.getTimeZoneShortIDs()) {
+         ColumnVector tzIndices = GpuTimeZoneDB.getZoneIDVector()) {
       return new ColumnVector(toTimestamp(cv.getNativeView(), transitions.getNativeView(),
-              tzIndices.getNativeView(), tzIndex, ansiEnabled, tzShortIDs.getNativeView()));
+              tzIndices.getNativeView(), tzIndex, ansiEnabled));
     }
   }
 
@@ -291,7 +290,7 @@ private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);
   private static native long toTimestamp(long input,
-      long transitions, long tzIndices, int tzIndex, boolean ansiEnabled, long tzShortIDs);
+      long transitions, long tzIndices, int tzIndex, boolean ansiEnabled);
   private static native long toTimestampWithoutTimeZone(long input, boolean allowTimeZone,
       boolean ansiEnabled);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index bf4e171c8d..4c84ae1892 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -34,8 +34,10 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TimeZone;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;
@@ -69,12 +71,8 @@ public class GpuTimeZoneDB {
   // zone id to index in `fixedTransitions`
   private Map<String, Integer> zoneIdToTable;
 
-  // Used to store Java ZoneId.SHORT_IDS Map, e.g.: For PST -> America/Los_Angeles
-  // Save: PST -> index of America/Los_Angeles in transition table.
-  private HostColumnVector shortIDs;
-
-  // zone id list
-  private HostColumnVector zoneIdVector;
+  // host column vector<String, Integer> for `zoneIdToTable`, sorted by time zone strings
+  private HostColumnVector zoneIdToTableVec;
 
   // Guarantee singleton instance
   private GpuTimeZoneDB() {
@@ -218,13 +216,9 @@ private void closeResources() {
       fixedTransitions.close();
       fixedTransitions = null;
     }
-    if (shortIDs != null) {
-      shortIDs.close();
-      shortIDs = null;
-    }
-    if (zoneIdVector != null) {
-      zoneIdVector.close();
-      zoneIdVector = null;
+    if (zoneIdToTableVec != null) {
+      zoneIdToTableVec.close();
+      zoneIdToTableVec = null;
     }
   }
 
@@ -284,74 +278,36 @@ public static ZoneId getZoneId(String timeZoneId) {
     return ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS);
   }
 
-  /**
-   * load ZoneId.SHORT_IDS and map to time zone index in transition table.
-   * Note: ignored EST: -05:00; HST: -10:00; MST: -07:00
-   */
-  private void loadTimeZoneShortIDs(Map<String, Integer> zoneIdToTable) {
-    HostColumnVector.DataType type = new HostColumnVector.StructType(false,
-    new HostColumnVector.BasicType(false, DType.STRING),
-    new HostColumnVector.BasicType(false, DType.INT32));
-    ArrayList<HostColumnVector.StructData> data = new ArrayList<>();
-    // copy short IDs
-    List<String> idList = new ArrayList<>(ZoneId.SHORT_IDS.keySet());
-    // sort short IDs
-    Collections.sort(idList);
-    for (String id : idList) {
-      assert(id.length() == 3); // short ID lenght is always 3
-      String mapTo = ZoneId.SHORT_IDS.get(id);
-      if (mapTo.startsWith("+") || mapTo.startsWith("-")) {
-        // skip: EST: -05:00; HST: -10:00; MST: -07:00
-        // kernel will handle EST, HST, MST
-        // ZoneId.SHORT_IDS is deprecated, so it will not probably change
-      } else {
-        Integer index = zoneIdToTable.get(mapTo);
-        // some short IDs are DST, skip unsupported
-        if (index != null) {
-          data.add(new HostColumnVector.StructData(id, index));
-        } else {
-          // TODO: index should not be null after DST is supported.
-        }
-      }
-    }
-    shortIDs = HostColumnVector.fromStructs(type, data);
-  }
-
   @SuppressWarnings("unchecked")
   private void loadData() {
     try {
-      zoneIdToTable = new HashMap<>();
-      List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
-      // Build a timezone ID index for the rendering of timezone IDs which may be included in datetime-like strings.
-      // For instance: "2023-11-5T03:04:55.1 Asia/Shanghai" -> This index helps to find the
-      // offset of "Asia/Shanghai" in timezoneDB.
-      //
-      // Currently, we do NOT support all timezone IDs. For unsupported time zones, like invalid ones,
-      // we replace them with NULL value when ANSI mode is off when parsing string to timestamp.
-      // This list only contains supported time zones.
-      List<String> zondIdList = new ArrayList<>();
-
-      // collect zone id and sort
-      List<ZoneId> ids = new ArrayList<>();
-      for (String tzId : TimeZone.getAvailableIDs()) {
-        ZoneId zoneId;
-        try {
-          zoneId = ZoneId.of(tzId, ZoneId.SHORT_IDS).normalized(); // we use the normalized form to dedupe
-          ids.add(zoneId);
-        } catch (ZoneRulesException e) {
-          // Sometimes the list of getAvailableIDs() is one of the 3-letter abbreviations, however,
-          // this use is deprecated due to ambiguity reasons (same abbrevation can be used for
-          // multiple time zones). These are not supported by ZoneId.of(...) directly here.
-          continue;
+      // Note: ZoneId.normalized() converts a fixed-offset time zone to its standard fixed-offset form,
+      // e.g.: ZoneId.of("Etc/GMT").normalized().getId() = Z; ZoneId.of("Etc/GMT+0").normalized().getId() = Z
+      // Both Etc/GMT and Etc/GMT+0 normalize to Z.
+      // We use the normalized form to dedupe the transition data,
+      // but must also record the mapping from each TimeZone.getAvailableIDs() entry to its normalized ID.
+      // `fixedTransitions` saves transitions for normalized time zones.
+      // Spark uses time zones from TimeZone.getAvailableIDs(),
+      // so we keep a Map<String, Integer> from TimeZone.getAvailableIDs() to the index in `fixedTransitions`.
+
+      // get and sort time zones
+      String[] timeZones = TimeZone.getAvailableIDs();
+      List<String> sortedTimeZones = new ArrayList<>(Arrays.asList(timeZones));
+      // Note: Z is a special normalized time zone from UTC: ZoneId.of("UTC").normalized() = Z
+      // TimeZone.getAvailableIDs() does not contain Z, and ZoneId.SHORT_IDS does not contain Z either,
+      // so Z must be added explicitly so that it ends up in `zoneIdToTable`.
+      sortedTimeZones.add("Z");
+      Collections.sort(sortedTimeZones);
+
+      // Note: Spark uses ZoneId.SHORT_IDS.
+      // `TimeZone.getAvailableIDs` is expected to contain all keys in `ZoneId.SHORT_IDS`,
+      // so no extra work is needed for ZoneId.SHORT_IDS; here we just verify this assumption.
+      for (String tz : ZoneId.SHORT_IDS.keySet()) {
+        if (!sortedTimeZones.contains(tz)) {
+          throw new IllegalStateException(
+              String.format("Can not find short Id %s in time zones %s", tz, sortedTimeZones));
         }
       }
-      Collections.sort(ids, new Comparator<ZoneId>() {
-        @Override
-        public int compare(ZoneId o1, ZoneId o2) {
-          // sort by `getId`
-          return o1.getId().compareTo(o2.getId());
-        }
-      });
 
       // A simple approach to transform LocalDateTime to a value which is proportional to
       // the exact EpochSecond. After caching these values along with EpochSeconds, we
@@ -364,66 +320,30 @@ public int compare(ZoneId o1, ZoneId o2) {
                       lt.getDayOfMonth() - 1) +
                       3600L * lt.getHour() + 60L * lt.getMinute() + lt.getSecond();
 
-      for (ZoneId zoneId : ids) {
-        ZoneRules zoneRules = zoneId.getRules();
+      List<List<HostColumnVector.StructData>> masterTransitions = new ArrayList<>();
+
+      // map: normalizedTimeZone -> index in fixedTransitions
+      Map<String, Integer> mapForNormalizedTimeZone = new HashMap<>();
+      // go through all time zones and save by normalized time zone
+      List<String> sortedSupportedTimeZones = new ArrayList<>();
+      for (String timeZone : sortedTimeZones) {
+        ZoneId normalizedZoneId = ZoneId.of(timeZone, ZoneId.SHORT_IDS).normalized();
+        String normalizedTimeZone = normalizedZoneId.getId();
+        ZoneRules zoneRules = normalizedZoneId.getRules();
         // Filter by non-repeating rules
         if (!zoneRules.isFixedOffset() && !zoneRules.getTransitionRules().isEmpty()) {
           continue;
         }
-        if (!zoneIdToTable.containsKey(zoneId.getId())) {
-          List<ZoneOffsetTransition> transitions = zoneRules.getTransitions();
+        sortedSupportedTimeZones.add(timeZone);
+        if (!mapForNormalizedTimeZone.containsKey(normalizedTimeZone)) { // dedup
+          List<HostColumnVector.StructData> data = getTransitionData(localToLooseEpochSecond, zoneRules);
+          // add transition data for time zone
           int idx = masterTransitions.size();
-          List<HostColumnVector.StructData> data = new ArrayList<>();
-          if (zoneRules.isFixedOffset()) {
-            data.add(
-                new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
-                    zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE)
-            );
-          } else {
-            // Capture the first official offset (before any transition) using Long min
-            ZoneOffsetTransition first = transitions.get(0);
-            data.add(
-                new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
-                    first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE)
-            );
-            transitions.forEach(t -> {
-              // Whether transition is an overlap vs gap.
-              // In Spark:
-              // if it's a gap, then we use the offset after *on* the instant
-              // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping
-              // So, for the transition to UTC, you need to compare to instant + {offset before} 
-              // The time math still uses {offset after}
-              if (t.isGap()) {
-                data.add(
-                    new HostColumnVector.StructData(
-                        t.getInstant().getEpochSecond(),
-                        t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(),
-                        t.getOffsetAfter().getTotalSeconds(),
-                        localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time
-                    )
-                );
-              } else {
-                data.add(
-                    new HostColumnVector.StructData(
-                        t.getInstant().getEpochSecond(),
-                        t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(),
-                        t.getOffsetAfter().getTotalSeconds(),
-                        localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time
-                    )
-                );
-              }
-            });
-          }
+          mapForNormalizedTimeZone.put(normalizedTimeZone, idx);
           masterTransitions.add(data);
-          zoneIdToTable.put(zoneId.getId(), idx);
-          // Collect the IDs of all supported timezones in the order of masterTransitions
-          zondIdList.add(zoneId.getId());
         }
       }
 
-      // load ZoneId.SHORT_IDS
-      loadTimeZoneShortIDs(zoneIdToTable);
-
       HostColumnVector.DataType childType = new HostColumnVector.StructType(false,
           new HostColumnVector.BasicType(false, DType.INT64),
           new HostColumnVector.BasicType(false, DType.INT64),
@@ -432,39 +352,117 @@ public int compare(ZoneId o1, ZoneId o2) {
       HostColumnVector.DataType resultType =
           new HostColumnVector.ListType(false, childType);
 
+      // generate all transitions for all time zones
       fixedTransitions = HostColumnVector.fromLists(resultType, masterTransitions.toArray(new List[0]));
-      zoneIdVector = HostColumnVector.fromStrings(zondIdList.toArray(new String[0]));
+
+      // generate `zoneIdToTable`, key should be time zone not normalized time zone
+      zoneIdToTable = new HashMap<>();
+      for (String timeZone : sortedSupportedTimeZones) {
+        // map from time zone to normalized
+        String normalized = ZoneId.of(timeZone, ZoneId.SHORT_IDS).normalized().getId();
+        Integer index = mapForNormalizedTimeZone.get(normalized);
+        if (index != null) {
+          zoneIdToTable.put(timeZone, index);
+        } else {
+          throw new IllegalStateException("Could not find index for normalized time zone " + normalized);
+        }
+      }
+      // generate host vector
+      zoneIdToTableVec = generateZoneIdToTableVec(sortedSupportedTimeZones, zoneIdToTable);
+    } catch (IllegalStateException e) {
+      throw e;
     } catch (Exception e) {
       throw new IllegalStateException("load time zone DB cache failed!", e);
     }
+  }
 
+  // generate transition data for a time zone
+  private List<HostColumnVector.StructData> getTransitionData(Function<LocalDateTime, Long> localToLooseEpochSecond,
+      ZoneRules zoneRules) {
+    List<ZoneOffsetTransition> transitions = zoneRules.getTransitions();
+    List<HostColumnVector.StructData> data = new ArrayList<>();
+    if (zoneRules.isFixedOffset()) {
+      data.add(
+          new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
+              zoneRules.getOffset(Instant.now()).getTotalSeconds(), Long.MIN_VALUE)
+      );
+    } else {
+      // Capture the first official offset (before any transition) using Long min
+      ZoneOffsetTransition first = transitions.get(0);
+      data.add(
+          new HostColumnVector.StructData(Long.MIN_VALUE, Long.MIN_VALUE,
+              first.getOffsetBefore().getTotalSeconds(), Long.MIN_VALUE)
+      );
+      transitions.forEach(t -> {
+        // Whether transition is an overlap vs gap.
+        // In Spark:
+        // if it's a gap, then we use the offset after *on* the instant
+        // If it's an overlap, then there are 2 sets of valid timestamps in that are overlapping
+        // So, for the transition to UTC, you need to compare to instant + {offset before} 
+        // The time math still uses {offset after}
+        if (t.isGap()) {
+          data.add(
+              new HostColumnVector.StructData(
+                  t.getInstant().getEpochSecond(),
+                  t.getInstant().getEpochSecond() + t.getOffsetAfter().getTotalSeconds(),
+                  t.getOffsetAfter().getTotalSeconds(),
+                  localToLooseEpochSecond.apply(t.getDateTimeAfter()) // this column is for rebase local date time
+              )
+          );
+        } else {
+          data.add(
+              new HostColumnVector.StructData(
+                  t.getInstant().getEpochSecond(),
+                  t.getInstant().getEpochSecond() + t.getOffsetBefore().getTotalSeconds(),
+                  t.getOffsetAfter().getTotalSeconds(),
+                  localToLooseEpochSecond.apply(t.getDateTimeBefore()) // this column is for rebase local date time
+              )
+          );
+        }
+      });
+    }
+    return data;
   }
 
   /**
-   * get map from time zone to time zone index in transition table. 
-   * @return map from time zone to time zone index in transition table. 
+   * Generate a host vector of (time zone, index in transition table) pairs.
+   * Each time zone is paired with the index looked up from zoneIdToTableMap.
+   * @param sortedSupportedTimeZones is sorted and supported time zones
+   * @param zoneIdToTableMap is a map from non-normalized time zone to index in transition table
    */
-  public static Map<String, Integer> getZoneIDMap() {
-    cacheDatabase();
-    return instance.zoneIdToTable;
+  private static HostColumnVector generateZoneIdToTableVec(List<String> sortedSupportedTimeZones, Map<String, Integer> zoneIdToTableMap) {
+    HostColumnVector.DataType type = new HostColumnVector.StructType(false,
+    new HostColumnVector.BasicType(false, DType.STRING),
+    new HostColumnVector.BasicType(false, DType.INT32));
+    ArrayList<HostColumnVector.StructData> data = new ArrayList<>();
+
+    for (String timeZone : sortedSupportedTimeZones) {
+      Integer mapTo = zoneIdToTableMap.get(timeZone);
+      if (mapTo != null) {
+        data.add(new HostColumnVector.StructData(timeZone, mapTo));
+      } else {
+        throw new IllegalStateException("Could not find index for time zone " + timeZone);
+      }
+    }
+    return HostColumnVector.fromStructs(type, data);
   }
 
   /**
-   * Get a map from short ID to time zone index in transitions for the short ID mapped time zone
-   * @return
+   * get map from time zone to time zone index in transition table. 
+   * @return map from time zone to time zone index in transition table. 
    */
-  public static ColumnVector getTimeZoneShortIDs() {
+  public static Map<String, Integer> getZoneIDMap() {
     cacheDatabase();
-    return instance.shortIDs.copyToDevice();
+    return instance.zoneIdToTable;
   }
 
   /**
-   * Get a time zone list which is corresponding to the transitions
+   * Get vector from time zone to index in transition table
    * @return
    */
   public static ColumnVector getZoneIDVector() {
     cacheDatabase();
-    return instance.zoneIdVector.copyToDevice();
+    return instance.zoneIdToTableVec.copyToDevice();
   }
 
   /**
@@ -493,7 +491,7 @@ private ColumnVector getFixedTransitions() {
    * @return list of fixed transitions
    */
   List getHostFixedTransitions(String zoneId) {
-    zoneId = ZoneId.of(zoneId).normalized().toString(); // we use the normalized form to dedupe
+    zoneId = ZoneId.of(zoneId, ZoneId.SHORT_IDS).normalized().toString(); // we use the normalized form to dedupe
     Integer idx = getZoneIDMap().get(zoneId);
     if (idx == null) {
       return null;
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index 86f5203249..de2a7738f6 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -412,6 +412,10 @@ void toTimestampTestWithTz() {
     entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 EST", 1699124695100000L + 13L * 3600L * 1000000L)); // EST is 8 + 5  hours later than Asia/Shanghai
     entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 HST", 1699124695100000L + 18L * 3600L * 1000000L)); // HST is 8 + 10 hours later than Asia/Shanghai
     entries.add(new AbstractMap.SimpleEntry<>("2023-11-5T03:04:55.1 MST", 1699124695100000L + 15L * 3600L * 1000000L)); // MST is 8 + 7  hours later than Asia/Shanghai
+    // test time zones not in notmalized names, e.g,: ZoneId.of("Etc/GMT").normalized.getId = Z; ZoneId.of("Etc/GMT+0").normalized.getId = Z; Etc/GMT+10 -> -10:00
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT", 1571610824100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+0", 1571610824100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571592825100000L));
 
     int validDataSize = entries.size();
 
@@ -459,8 +463,10 @@ void toTimestampTestWithTz() {
     try (
         ColumnVector input = ColumnVector.fromStrings(inputs.toArray(new String[0]));
         ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(expects.toArray(new Long[0]));
-        ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false)) {
+        ColumnVector actual = CastStrings.toTimestamp(input, ZoneId.of("UTC"), false);
+        ColumnVector actual2 = CastStrings.toTimestamp(input, ZoneId.of("Z"), false)) {
       AssertUtils.assertColumnsAreEqual(expected, actual);
+      AssertUtils.assertColumnsAreEqual(expected, actual2);
     }
 
     // Should NOT throw exception because all inputs are valid

From 0a7efd9b473880f8a64ed5a2f9d8571d423a5333 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 26 Jan 2024 13:34:19 +0800
Subject: [PATCH 31/35] Refactor to address comments

---
 src/main/cpp/src/datetime_parser.cu           | 144 +++++++++---------
 src/main/cpp/src/datetime_parser.hpp          |  16 +-
 .../spark/rapids/jni/GpuTimeZoneDB.java       |   2 +-
 3 files changed, 78 insertions(+), 84 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index fb72944cdb..e1ad607555 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,24 +16,17 @@
 
 #include "datetime_parser.hpp"
 
-#include <iostream>
-#include <vector>
-
-#include <cuda/std/cassert>
-
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/valid_if.cuh>
-
 #include <cudf/lists/list_device_view.cuh>
 #include <cudf/lists/lists_column_device_view.cuh>
+#include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/string_view.cuh>
-
-#include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
@@ -49,15 +42,12 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-using column                   = cudf::column;
-using column_device_view       = cudf::column_device_view;
-using column_view              = cudf::column_view;
-using lists_column_device_view = cudf::detail::lists_column_device_view;
-using size_type                = cudf::size_type;
-using string_view              = cudf::string_view;
-using struct_view              = cudf::struct_view;
-using table_view               = cudf::table_view;
+#include <cuda/std/cassert>
 
+#include <iostream>
+#include <vector>
+
+namespace spark_rapids_jni {
 namespace {
 
 /**
@@ -99,11 +89,11 @@ __device__ __host__ inline bool is_whitespace(const char chr)
 __device__ __host__ bool is_valid_digits(int segment, int digits)
 {
   // A Long is able to represent a timestamp within [+-]200 thousand years
-  const int constexpr maxDigitsYear = 6;
+  constexpr int maxDigitsYear = 6;
   // For the nanosecond part, more than 6 digits is allowed, but will be
   // truncated.
   return segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
-         // For the zoneId segment(7), it's could be zero digits when it's a
+         // For the zoneId segment(7), it could be zero digits when it's a
          // region-based zone ID
          (segment == 7 && digits <= 2) ||
          (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2);
@@ -113,7 +103,7 @@ __device__ __host__ bool is_valid_digits(int segment, int digits)
  * function to get a string from string view
  */
 struct get_string_fn {
-  column_device_view const& string_view;
+  cudf::column_device_view const& string_view;
 
   __device__ cudf::string_view operator()(size_t idx)
   {
@@ -132,15 +122,18 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 };
 
 template <bool with_timezone>
 struct parse_timestamp_string_fn {
-  column_device_view const d_strings;
-  size_type default_tz_index;
-  bool allow_tz_in_date_str = true;
+  // below three are required:
+  cudf::column_device_view const& d_strings;
+  cudf::size_type const default_tz_index;
+  bool const allow_tz_in_date_str;
+
+  // below two are optional:
   // The list column of transitions to figure out the correct offset
   // to adjust the timestamp. The type of the values in this column is
   // LIST<STRUCT<utcInstant: int64, tzInstant: int64, utcOffset: int32,
   // looseTzInstant: int64>>.
-  thrust::optional<lists_column_device_view const> transitions = thrust::nullopt;
-  thrust::optional<column_device_view const> tz_indices        = thrust::nullopt;
+  thrust::optional<cudf::detail::lists_column_device_view const> transitions = thrust::nullopt;
+  thrust::optional<cudf::column_device_view const> tz_indices                = thrust::nullopt;
 
   __device__ thrust::tuple<cudf::timestamp_us, uint8_t> operator()(const cudf::size_type& idx) const
   {
@@ -152,16 +145,15 @@ struct parse_timestamp_string_fn {
     auto const d_str = d_strings.element<cudf::string_view>(idx);
 
     timestamp_components ts_comp{};
-    char const* tz_lit_ptr = nullptr;
-    size_type tz_lit_len   = 0;
+    char const* tz_lit_ptr     = nullptr;
+    cudf::size_type tz_lit_len = 0;
     switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) {
       case ParseResult::INVALID:
         return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       case ParseResult::UNSUPPORTED:
         return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}},
                                   ParseResult::UNSUPPORTED);
-      case ParseResult::OK:
-      default: break;
+      case ParseResult::OK: break;
     }
 
     if constexpr (!with_timezone) {
@@ -179,14 +171,14 @@ struct parse_timestamp_string_fn {
       // no tz in the string tailing, use default tz
       utc_offset = compute_utc_offset(compute_loose_epoch_s(ts_comp), default_tz_index);
     } else {
-      auto tz_view = string_view(tz_lit_ptr, tz_lit_len);
+      auto const tz_view = cudf::string_view(tz_lit_ptr, tz_lit_len);
       // Firstly, try parsing as utc-like timezone rep
       auto [fix_offset, ret_code] = parse_utc_like_tz(tz_view);
       if (ret_code == ParseUtcLikeTzResult::UTC_LIKE_TZ) {
         utc_offset = fix_offset;
       } else if (ret_code == ParseUtcLikeTzResult::NOT_UTC_LIKE_TZ) {
         // Then, try parsing as region-based timezone ID
-        auto tz_index = query_index_from_tz_db(tz_view);
+        auto const tz_index = query_index_from_tz_db(tz_view);
         if (tz_index < 0) {
           // TODO: distinguish unsupported and invalid tz
           return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
@@ -216,7 +208,7 @@ struct parse_timestamp_string_fn {
   /**
    *
    * Parse UTC-like timezone representation such as: UTC+11:22:33, GMT-8:08:01.
-   * This function is purposed to be fully align to Apache Spark's behavior. The
+   * This function is purposed to be fully aligned to Apache Spark's behavior. The
    * function returns the status along with the ParseUtcLikeTzResult result.
    *
    *   with colon
@@ -230,9 +222,9 @@ struct parse_timestamp_string_fn {
    * is invalid)
    */
   __device__ inline thrust::pair<int64_t, ParseUtcLikeTzResult> parse_utc_like_tz(
-    string_view const& tz_lit) const
+    cudf::string_view const& tz_lit) const
   {
-    size_type len = tz_lit.size_bytes();
+    cudf::size_type const len = tz_lit.size_bytes();
 
     char const* ptr = tz_lit.data();
 
@@ -262,7 +254,7 @@ struct parse_timestamp_string_fn {
     // parse hh:mm:ss
     int64_t hms[3] = {0L, 0L, 0L};
     bool has_colon = false;
-    for (size_type i = 0; i < 3; i++) {
+    for (cudf::size_type i = 0; i < 3; i++) {
       // deal with the first digit
       hms[i] = *(ptr + char_offset++) - '0';
       if (hms[i] < 0 || hms[i] > 9) return {0, ParseUtcLikeTzResult::INVALID};
@@ -287,7 +279,7 @@ struct parse_timestamp_string_fn {
       }
 
       // deal with the second digit
-      auto digit = *(ptr + char_offset++) - '0';
+      auto const digit = *(ptr + char_offset++) - '0';
       if (digit < 0 || digit > 9) return {0, ParseUtcLikeTzResult::INVALID};
       hms[i] = hms[i] * 10 + digit;
 
@@ -310,19 +302,20 @@ struct parse_timestamp_string_fn {
   /**
    * use binary search to find tz index.
    */
-  __device__ inline int query_index_from_tz_db(string_view const& tz_lit) const
+  __device__ inline int query_index_from_tz_db(cudf::string_view const& tz_lit) const
   {
     auto const tz_col                  = tz_indices->child(0);
     auto const index_in_transition_col = tz_indices->child(1);
 
-    auto string_iter_begin =
+    auto const string_iter_begin =
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), get_string_fn{tz_col});
-    auto string_iter_end = string_iter_begin + tz_col.size();
-    auto it              = thrust::lower_bound(
-      thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less<string_view>());
+    auto const string_iter_end = string_iter_begin + tz_col.size();
+    auto const it              = thrust::lower_bound(
+      thrust::seq, string_iter_begin, string_iter_end, tz_lit, thrust::less<cudf::string_view>());
     if (it != string_iter_end && *it == tz_lit) {
       // found tz
-      auto tz_name_index = static_cast<size_type>(it - string_iter_begin);
+      auto const tz_name_index =
+        static_cast<cudf::size_type>(thrust::distance(string_iter_begin, it));
       return static_cast<int>(index_in_transition_col.element<int32_t>(tz_name_index));
     } else {
       // not found tz
@@ -335,7 +328,8 @@ struct parse_timestamp_string_fn {
    * instants. Basically, this is the same approach as
    * `convert_timestamp_tz_functor`.
    */
-  __device__ inline int64_t compute_utc_offset(int64_t loose_epoch_second, size_type tz_index) const
+  __device__ inline int64_t compute_utc_offset(int64_t const loose_epoch_second,
+                                               cudf::size_type const tz_index) const
   {
     auto const& utc_offsets    = transitions->child().child(2);
     auto const& loose_instants = transitions->child().child(3);
@@ -349,7 +343,7 @@ struct parse_timestamp_string_fn {
 
     auto const it = thrust::upper_bound(
       thrust::seq, transition_times.begin(), transition_times.end(), loose_epoch_second);
-    auto const idx         = static_cast<size_type>(thrust::distance(transition_times.begin(), it));
+    auto const idx = static_cast<cudf::size_type>(thrust::distance(transition_times.begin(), it));
     auto const list_offset = local_transitions.element_offset(idx - 1);
     return static_cast<int64_t>(utc_offsets.element<int32_t>(list_offset));
   }
@@ -362,6 +356,10 @@ struct parse_timestamp_string_fn {
    * epoch, as a computation approach, helps us to align probe(kernel side) to
    * the TimezoneDB(Java side). Then, we can apply binary search based on loose
    * epoch instants of TimezoneDB to find out the correct timezone offset.
+   *
+   * Loose epoch column is used for binary search.
+   * Here we use 400 days a year, it's safe, because mapping from local time to
+   * loose epoch is monotonic.
    */
   __device__ inline int64_t compute_loose_epoch_s(timestamp_components const& ts) const
   {
@@ -370,7 +368,7 @@ struct parse_timestamp_string_fn {
   }
 
   /**
-   * Leverage STL to convert local time to UTC unix_timestamp(in millisecond)
+   * Leverage STL to convert local time to UTC timestamp(in microseconds)
    */
   __device__ inline int64_t compute_epoch_us(timestamp_components const& ts) const
   {
@@ -378,9 +376,10 @@ struct parse_timestamp_string_fn {
       cuda::std::chrono::year_month_day(cuda::std::chrono::year{ts.year},
                                         cuda::std::chrono::month{static_cast<uint32_t>(ts.month)},
                                         cuda::std::chrono::day{static_cast<uint32_t>(ts.day)});
-    auto days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
+    auto const days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count();
 
-    int64_t timestamp_s = (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
+    int64_t const timestamp_s =
+      (days * 24L * 3600L) + (ts.hour * 3600L) + (ts.minute * 60L) + ts.second;
     return timestamp_s * 1000000L + ts.microseconds;
   }
 
@@ -395,7 +394,7 @@ struct parse_timestamp_string_fn {
   __device__ inline ParseResult parse_string_to_timestamp_us(
     timestamp_components* ts_comp,
     char const** parsed_tz_ptr,
-    size_type* parsed_tz_length,
+    cudf::size_type* parsed_tz_length,
     cudf::string_view const& timestamp_str) const
   {
     const char* curr_ptr = timestamp_str.data();
@@ -412,8 +411,8 @@ struct parse_timestamp_string_fn {
 
     if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
 
-    const char* const bytes      = curr_ptr;
-    const size_type bytes_length = end_ptr - curr_ptr;
+    const char* const bytes            = curr_ptr;
+    const cudf::size_type bytes_length = end_ptr - curr_ptr;
 
     // segments stores: [year, month, day, hour, minute, seconds, microseconds, no_use_item,
     // no_use_item] the two tail items are no use, but here keeps them as Spark does
@@ -424,7 +423,6 @@ struct parse_timestamp_string_fn {
     int current_segment_digits = 0;
     size_t j                   = 0;
     int digits_milli           = 0;
-    // bool just_time = false;
     thrust::optional<int> year_sign;
     if ('-' == bytes[j] || '+' == bytes[j]) {
       if ('-' == bytes[j]) {
@@ -436,11 +434,10 @@ struct parse_timestamp_string_fn {
     }
 
     while (j < bytes_length) {
-      char b           = bytes[j];
-      int parsed_value = static_cast<int32_t>(b - '0');
+      char const b           = bytes[j];
+      int const parsed_value = static_cast<int32_t>(b - '0');
       if (parsed_value < 0 || parsed_value > 9) {
         if (0 == j && 'T' == b) {
-          // just_time = true;
           i += 3;
         } else if (i < 2) {
           if (b == '-') {
@@ -450,7 +447,6 @@ struct parse_timestamp_string_fn {
             current_segment_digits = 0;
             i += 1;
           } else if (0 == i && ':' == b && !year_sign.has_value()) {
-            // just_time = true;
             if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; }
             segments[3]            = current_segment_value;
             current_segment_value  = 0;
@@ -561,17 +557,17 @@ struct parse_timestamp_string_fn {
  * are nullptr, otherwise called from `string_to_timestamp_with_tz`.
  *
  */
-std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& input,
-                                           bool ansi_mode,
-                                           bool allow_tz_in_date_str            = true,
-                                           size_type default_tz_index           = 1000000000,
-                                           cudf::column_view const* transitions = nullptr,
-                                           cudf::column_view const* tz_indices  = nullptr)
+std::unique_ptr<cudf::column> to_timestamp(
+  cudf::strings_column_view const& input,
+  bool const ansi_mode,
+  bool const allow_tz_in_date_str,
+  cudf::size_type const default_tz_index = -1,
+  cudf::column_view const* transitions   = nullptr,
+  cudf::column_view const* tz_indices    = nullptr,
+  rmm::cuda_stream_view stream           = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource())
 {
-  auto const stream = cudf::get_default_stream();
-  auto const mr     = rmm::mr::get_current_device_resource();
-
-  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
   // column to store the result timestamp
   auto result_col =
     cudf::make_timestamp_column(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS},
@@ -593,9 +589,9 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
                            result_valid_col->mutable_view().begin<uint8_t>())),
       parse_timestamp_string_fn<false>{*d_strings, default_tz_index, allow_tz_in_date_str});
   } else {
-    auto const ft_cdv_ptr    = column_device_view::create(*transitions, stream);
-    auto const d_transitions = lists_column_device_view{*ft_cdv_ptr};
-    auto d_tz_indices        = cudf::column_device_view::create(*tz_indices, stream);
+    auto const ft_cdv_ptr    = cudf::column_device_view::create(*transitions, stream);
+    auto const d_transitions = cudf::detail::lists_column_device_view{*ft_cdv_ptr};
+    auto const d_tz_indices  = cudf::column_device_view::create(*tz_indices, stream);
 
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator(0),
@@ -634,9 +630,7 @@ std::unique_ptr<cudf::column> to_timestamp(cudf::strings_column_view const& inpu
   return result_col;
 }
 
-}  // namespace
-
-namespace spark_rapids_jni {
+}  // anonymous namespace
 
 /**
  * Parse string column with time zone to timestamp column.
@@ -648,8 +642,8 @@ namespace spark_rapids_jni {
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_view const& input,
                                                           cudf::column_view const& transitions,
                                                           cudf::column_view const& tz_indices,
-                                                          cudf::size_type default_tz_index,
-                                                          bool ansi_mode)
+                                                          cudf::size_type const default_tz_index,
+                                                          bool const ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(input, ansi_mode, true, default_tz_index, &transitions, &tz_indices);
@@ -662,8 +656,8 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_v
  *
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(cudf::strings_column_view const& input,
-                                                             bool allow_time_zone,
-                                                             bool ansi_mode)
+                                                             bool const allow_time_zone,
+                                                             bool const ansi_mode)
 {
   if (input.size() == 0) { return nullptr; }
   return to_timestamp(input, ansi_mode, allow_time_zone);
diff --git a/src/main/cpp/src/datetime_parser.hpp b/src/main/cpp/src/datetime_parser.hpp
index 2d45f68dca..ba83f43064 100644
--- a/src/main/cpp/src/datetime_parser.hpp
+++ b/src/main/cpp/src/datetime_parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -61,9 +61,9 @@ namespace spark_rapids_jni {
  * Unlike Spark, Spark-Rapids currently does not support DST time zones.
  *
  * @param input input string column view.
- * @param transitions TimezoneDB, the table of transitions contains all
- * information for timezones
- * @param tz_indices TimezoneDB index of region-based timezone IDs
+ * @param transitions refer to TimezoneDB, the table of transitions contains all
+ * information for timezones.
+ * @param tz_indices refer to TimezoneDB, map from time zone to TimezoneDB transition index.
  * @param default_tz_index the index of default timezone in TimezoneDB, if input
  * date-like string does not contain a time zone (like: YYYY-MM-DD:hhmmss), use
  * this time zone.
@@ -75,8 +75,8 @@ namespace spark_rapids_jni {
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_view const& input,
                                                           cudf::column_view const& transitions,
                                                           cudf::column_view const& tz_indices,
-                                                          cudf::size_type default_tz_index,
-                                                          bool ansi_mode);
+                                                          cudf::size_type const default_tz_index,
+                                                          bool const ansi_mode);
 
 /**
  *
@@ -130,7 +130,7 @@ std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_v
  * if there exists invalid inputs and ANSI mode is on.
  */
 std::unique_ptr<cudf::column> string_to_timestamp_without_tz(cudf::strings_column_view const& input,
-                                                             bool allow_time_zone,
-                                                             bool ansi_mode);
+                                                             bool const allow_time_zone,
+                                                             bool const ansi_mode);
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 4c84ae1892..5da89319cb 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2023-2024-2024, NVIDIA CORPORATION.
+* Copyright (c) 2023-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

From 21f99dbabb323f319e2fa08611908fcc727cac27 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 26 Jan 2024 17:38:23 +0800
Subject: [PATCH 32/35] Fix cases

---
 .../java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java  | 8 +++++---
 .../com/nvidia/spark/rapids/jni/CastStringsTest.java     | 2 +-
 .../java/com/nvidia/spark/rapids/jni/TimeZoneTest.java   | 9 ++++++++-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
index 5da89319cb..efcd592604 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java
@@ -66,9 +66,12 @@ public class GpuTimeZoneDB {
   // structs. The type of this column vector is:
   //   LIST<STRUCT<utcInstant: int64, localInstant: int64, offset: int32, looseInstant: int64>>
   // use this reference to indicate if time zone cache is initialized.
+  // `fixedTransitions` saves transitions for deduplicated time zones, different time zones
+  // may map to one normalized time zone.
   private HostColumnVector fixedTransitions;
 
-  // zone id to index in `fixedTransitions`
+  // time zone to index in `fixedTransitions`
+  // The key of `zoneIdToTable` is the time zone names before dedup.
   private Map<String, Integer> zoneIdToTable;
 
   // host column vector<String, Integer> for `zoneIdToTable`, sorted by time zone strings
@@ -487,11 +490,10 @@ private ColumnVector getFixedTransitions() {
    * fixed transitions for a particular zoneId. 
    *
    * It has default visibility so the test can access it.
-   * @param zoneId
+   * @param zoneId the time zones from TimeZone.getAvailableIDs without `ZoneId.normalized`
    * @return list of fixed transitions
    */
   List getHostFixedTransitions(String zoneId) {
-    zoneId = ZoneId.of(zoneId, ZoneId.SHORT_IDS).normalized().toString(); // we use the normalized form to dedupe
     Integer idx = getZoneIDMap().get(zoneId);
     if (idx == null) {
       return null;
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
index de2a7738f6..cafe69a6b4 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java
@@ -415,7 +415,7 @@ void toTimestampTestWithTz() {
     // test time zones not in notmalized names, e.g,: ZoneId.of("Etc/GMT").normalized.getId = Z; ZoneId.of("Etc/GMT+0").normalized.getId = Z; Etc/GMT+10 -> -10:00
     entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT", 1571610824100000L));
     entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+0", 1571610824100000L));
-    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571592825100000L));
+    entries.add(new AbstractMap.SimpleEntry<>("2019-10-20 22:33:44.1 Etc/GMT+10", 1571646824100000L));
 
     int validDataSize = entries.size();
 
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
index 7aaec496de..ebc3e2cb58 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
@@ -22,6 +22,7 @@
 import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
 
 import ai.rapids.cudf.ColumnVector;
 
@@ -45,12 +46,18 @@ static void cleanup() {
   void databaseLoadedTest() {
     // Check for a few timezones
     GpuTimeZoneDB instance = GpuTimeZoneDB.getInstance();
+
+    // UTC+8 is not in `TimeZone.getAvailableIDs`, so null is returned
+    // UTC+8 can be handled by the kernel directly
     List transitions = instance.getHostFixedTransitions("UTC+8");
-    assertNotNull(transitions);
+    assertNull(transitions);
+
     assertEquals(1, transitions.size());
     transitions = instance.getHostFixedTransitions("Asia/Shanghai");
     assertNotNull(transitions);
+
     ZoneId shanghai = ZoneId.of("Asia/Shanghai").normalized();
+    // inserted a min transition place holder, so it's n + 1
     assertEquals(shanghai.getRules().getTransitions().size() + 1, transitions.size());
   }
   

From 6ddb91c12a1b8618911dc30f954b87dc258d8c0e Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 26 Jan 2024 20:54:50 +0800
Subject: [PATCH 33/35] Fix cudaErrorIllegalAddress error; Fix null pointer bug

---
 src/main/cpp/src/datetime_parser.cu                         | 2 +-
 src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index e1ad607555..eae5503645 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -123,7 +123,7 @@ enum ParseResult { OK = 0, INVALID = 1, UNSUPPORTED = 2 };
 template <bool with_timezone>
 struct parse_timestamp_string_fn {
   // below three are required:
-  cudf::column_device_view const& d_strings;
+  cudf::column_device_view const d_strings;
   cudf::size_type const default_tz_index;
   bool const allow_tz_in_date_str;
 
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
index ebc3e2cb58..f50fe64c51 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/TimeZoneTest.java
@@ -52,7 +52,6 @@ void databaseLoadedTest() {
     List transitions = instance.getHostFixedTransitions("UTC+8");
     assertNull(transitions);
 
-    assertEquals(1, transitions.size());
     transitions = instance.getHostFixedTransitions("Asia/Shanghai");
     assertNotNull(transitions);
 

From 863cb8330d16f33c20f1e284b7b3125253c74e05 Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Fri, 26 Jan 2024 21:17:53 +0800
Subject: [PATCH 34/35] Update comments

---
 src/main/cpp/src/datetime_parser.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index eae5503645..d02139cce4 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -635,8 +635,9 @@ std::unique_ptr<cudf::column> to_timestamp(
 /**
  * Parse string column with time zone to timestamp column.
  * If a string does not have time zone in it, use the default time zone.
- * Returns nullptr if ANSI mode is true and strings have any invalid value, returns non-null
- * timestamp column otherwise.
+ *
+ * Returns nullptr if ANSI mode is true and strings have invalid data,
+ * otherwise, returns non-null timestamp column(the invalid date will be empty in this column)
  *
  */
 std::unique_ptr<cudf::column> string_to_timestamp_with_tz(cudf::strings_column_view const& input,

From de746455d7c52abe94bb219fdb9dac71976e74cf Mon Sep 17 00:00:00 2001
From: Chong Gao <res_life@163.com>
Date: Tue, 30 Jan 2024 14:51:35 +0800
Subject: [PATCH 35/35] Refactor

---
 src/main/cpp/src/datetime_parser.cu | 79 +++++++++++++++--------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/src/main/cpp/src/datetime_parser.cu b/src/main/cpp/src/datetime_parser.cu
index d02139cce4..505f9821ad 100644
--- a/src/main/cpp/src/datetime_parser.cu
+++ b/src/main/cpp/src/datetime_parser.cu
@@ -142,12 +142,14 @@ struct parse_timestamp_string_fn {
       return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
     }
 
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-
-    timestamp_components ts_comp{};
-    char const* tz_lit_ptr     = nullptr;
-    cudf::size_type tz_lit_len = 0;
-    switch (parse_string_to_timestamp_us(&ts_comp, &tz_lit_ptr, &tz_lit_len, d_str)) {
+    auto const d_str     = d_strings.element<cudf::string_view>(idx);
+    auto parse_ret_tuple = parse_string_to_timestamp_us(d_str);
+    auto ts_comp         = thrust::get<0>(parse_ret_tuple);
+    auto tz_lit_ptr      = thrust::get<1>(parse_ret_tuple);
+    auto tz_lit_len      = thrust::get<2>(parse_ret_tuple);
+    auto result          = thrust::get<3>(parse_ret_tuple);
+
+    switch (result) {
       case ParseResult::INVALID:
         return thrust::make_tuple(cudf::timestamp_us{cudf::duration_us{0}}, ParseResult::INVALID);
       case ParseResult::UNSUPPORTED:
@@ -391,12 +393,15 @@ struct parse_timestamp_string_fn {
    * Parse a string with time zone to a timestamp.
    * The bool in the returned tuple is false if the parse failed.
    */
-  __device__ inline ParseResult parse_string_to_timestamp_us(
-    timestamp_components* ts_comp,
-    char const** parsed_tz_ptr,
-    cudf::size_type* parsed_tz_length,
-    cudf::string_view const& timestamp_str) const
+  __device__ inline thrust::tuple<timestamp_components, char const*, cudf::size_type, ParseResult>
+  parse_string_to_timestamp_us(cudf::string_view const& timestamp_str) const
   {
+    timestamp_components ts_comp{};
+    char const* parsed_tz_ptr        = nullptr;
+    cudf::size_type parsed_tz_length = -1;
+    auto invalid_ret =
+      thrust::make_tuple(ts_comp, parsed_tz_ptr, parsed_tz_length, ParseResult::INVALID);
+
     const char* curr_ptr = timestamp_str.data();
     const char* end_ptr  = curr_ptr + timestamp_str.size_bytes();
 
@@ -409,7 +414,7 @@ struct parse_timestamp_string_fn {
       --end_ptr;
     }
 
-    if (curr_ptr == end_ptr) { return ParseResult::INVALID; }
+    if (curr_ptr == end_ptr) { return invalid_ret; }
 
     const char* const bytes            = curr_ptr;
     const cudf::size_type bytes_length = end_ptr - curr_ptr;
@@ -441,72 +446,72 @@ struct parse_timestamp_string_fn {
           i += 3;
         } else if (i < 2) {
           if (b == '-') {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else if (0 == i && ':' == b && !year_sign.has_value()) {
-            if (!is_valid_digits(3, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(3, current_segment_digits)) { return invalid_ret; }
             segments[3]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i                      = 4;
           } else {
-            return ParseResult::INVALID;
+            return invalid_ret;
           }
         } else if (2 == i) {
           if (' ' == b || 'T' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
-            return ParseResult::INVALID;
+            return invalid_ret;
           }
         } else if (3 == i || 4 == i) {
           if (':' == b) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
-            return ParseResult::INVALID;
+            return invalid_ret;
           }
         } else if (5 == i || 6 == i) {
           if ('.' == b && 5 == i) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
             if (!is_valid_digits(i, current_segment_digits) || !allow_tz_in_date_str) {
-              return ParseResult::INVALID;
+              return invalid_ret;
             }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
-            *parsed_tz_ptr = bytes + j;
+            parsed_tz_ptr = bytes + j;
             // strip the whitespace between timestamp and timezone
-            while (*parsed_tz_ptr < end_ptr && is_whitespace(**parsed_tz_ptr))
-              ++(*parsed_tz_ptr);
-            *parsed_tz_length = end_ptr - *parsed_tz_ptr;
+            while (parsed_tz_ptr < end_ptr && is_whitespace(*parsed_tz_ptr))
+              ++parsed_tz_ptr;
+            parsed_tz_length = end_ptr - parsed_tz_ptr;
             break;
           }
           if (i == 6 && '.' != b) { i += 1; }
         } else {
           if (i < segments_len && (':' == b || ' ' == b)) {
-            if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+            if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
             segments[i]            = current_segment_value;
             current_segment_value  = 0;
             current_segment_digits = 0;
             i += 1;
           } else {
-            return ParseResult::INVALID;
+            return invalid_ret;
           }
         }
       } else {
@@ -521,7 +526,7 @@ struct parse_timestamp_string_fn {
       j += 1;
     }
 
-    if (!is_valid_digits(i, current_segment_digits)) { return ParseResult::INVALID; }
+    if (!is_valid_digits(i, current_segment_digits)) { return invalid_ret; }
     segments[i] = current_segment_value;
 
     while (digits_milli < 6) {
@@ -535,15 +540,15 @@ struct parse_timestamp_string_fn {
     // copy segments to equivalent kernel timestamp_components
     // Note: In order to keep above code is equivalent to Spark implementation,
     //       did not use `timestamp_components` directly to save values.
-    ts_comp->year         = segments[0];
-    ts_comp->month        = static_cast<int8_t>(segments[1]);
-    ts_comp->day          = static_cast<int8_t>(segments[2]);
-    ts_comp->hour         = static_cast<int8_t>(segments[3]);
-    ts_comp->minute       = static_cast<int8_t>(segments[4]);
-    ts_comp->second       = static_cast<int8_t>(segments[5]);
-    ts_comp->microseconds = segments[6];
-
-    return ParseResult::OK;
+    ts_comp.year         = segments[0];
+    ts_comp.month        = static_cast<int8_t>(segments[1]);
+    ts_comp.day          = static_cast<int8_t>(segments[2]);
+    ts_comp.hour         = static_cast<int8_t>(segments[3]);
+    ts_comp.minute       = static_cast<int8_t>(segments[4]);
+    ts_comp.second       = static_cast<int8_t>(segments[5]);
+    ts_comp.microseconds = segments[6];
+
+    return thrust::make_tuple(ts_comp, parsed_tz_ptr, parsed_tz_length, ParseResult::OK);
   }
 };