From 181b8c81d733db1b6eb30230f7e98ede8671706c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 22 Jan 2024 22:30:02 +0800 Subject: [PATCH] Use JNI kernel to parse string to timestamp Signed-off-by: Chong Gao --- docs/compatibility.md | 45 +++--- .../com/nvidia/spark/rapids/GpuCast.scala | 145 +----------------- 2 files changed, 28 insertions(+), 162 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 8060866dc3b..daf18a6ba2b 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -764,32 +764,29 @@ To allow casts from string to timestamp on the GPU, enable the configuration pro Casting from string to timestamp currently has the following limitations. -| Format or Pattern | Supported on GPU? | -| ------------------------------------------------------------------- | ------------------| -| `"yyyy"` | Yes | -| `"yyyy-[M]M"` | Yes | -| `"yyyy-[M]M "` | Yes | -| `"yyyy-[M]M-[d]d"` | Yes | -| `"yyyy-[M]M-[d]d "` | Yes | -| `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | -| `"epoch"` | Yes | -| `"now"` | Yes | -| `"today"` | Yes | -| `"tomorrow"` | Yes | -| `"yesterday"` | Yes | +| Format or Pattern | Supported on GPU? | +|---------------------------------------------------------------------|------------------------------| +| `"yyyy"` | Yes | +| `"yyyy-[M]M"` | Yes | +| `"yyyy-[M]M "` | Yes | +| `"yyyy-[M]M-[d]d"` | Yes | +| `"yyyy-[M]M-[d]d "` | Yes | +| `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | +| `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s[zone_id]"` | Partial [\[1\]](#Footnote1) | +| `"yyyy-[M]M-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | +| `"yyyy-[M]M-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | Partial [\[1\]](#Footnote1) | +| `"[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | No | +| `"T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]"` | No | +| `"epoch"` | No | +| `"now"` | No | +| `"today"` | No | +| `"tomorrow"` | No | +| `"yesterday"` | No | - [1] Leap seconds are not supported. If a zone_id is provided then only - timezone 'Z' (UTC) is supported. Casting unsupported formats will result in null values. - -Spark is very lenient when casting from string to timestamp because all date and time components -are optional, meaning that input values such as `T`, `T2`, `:`, `::`, `1:`, `:1`, and `::1` -are considered valid timestamps. The GPU will treat these values as invalid and cast them to null -values. + non-DST(Daylight Saving Times) timezones are supported. Casting unsupported formats will result in + null values. From Spark 320 and 320+, Spark does not support "epoch", "now", "today", "tomorrow", + "yesterday", GPU is consistent with Spark320+. ### Constant Folding diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 1e70090d0a7..b056f30b4a0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids import java.text.SimpleDateFormat -import java.time.DateTimeException +import java.time.{DateTimeException, ZoneId} import java.util.Optional import scala.collection.mutable.ArrayBuffer @@ -289,14 +289,6 @@ object GpuCast { private val DATE_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{1,2}\\Z" private val DATE_REGEX_YYYY = "\\A\\d{4}\\Z" - private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ ]?\\Z" - private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{1,2}[ ]?\\Z" - private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}[ ]?\\Z" - private val TIMESTAMP_REGEX_FULL = - "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ T]?(\\d{1,2}:\\d{1,2}:([0-5]\\d|\\d)(\\.\\d{0,6})?Z?)\\Z" - private val TIMESTAMP_REGEX_NO_DATE = - "\\A[T]?(\\d{1,2}:\\d{1,2}:([0-5]\\d|\\d)(\\.\\d{0,6})?Z?)\\Z" - private val BIG_DECIMAL_LONG_MIN = BigDecimal(Long.MinValue) private val BIG_DECIMAL_LONG_MAX = BigDecimal(Long.MaxValue) @@ -561,7 +553,7 @@ object GpuCast { castStringToDate(trimmed) } case TimestampType => - castStringToTimestamp(trimmed, ansiMode) + castStringToTimestamp(trimmed, ansiMode, options.timeZoneId) } } case (StringType, dt: DecimalType) => @@ -1413,134 +1405,11 @@ object GpuCast { } } - /** This method does not close the `input` ColumnVector. */ - def convertTimestampOrNull( - input: ColumnVector, - regex: String, - cudfFormat: String): ColumnVector = { - - withResource(Scalar.fromNull(DType.TIMESTAMP_MICROSECONDS)) { orElse => - val prog = new RegexProgram(regex, CaptureGroups.NON_CAPTURE) - val isValidTimestamp = withResource(input.matchesRe(prog)) { isMatch => - withResource(input.isTimestamp(cudfFormat)) { isTimestamp => - isMatch.and(isTimestamp) - } - } - withResource(isValidTimestamp) { isValidTimestamp => - withResource(input.asTimestampMicroseconds(cudfFormat)) { asDays => - isValidTimestamp.ifElse(asDays, orElse) - } - } - } - } - - /** This method does not close the `input` ColumnVector. */ - private def convertTimestampOr( - input: ColumnVector, - regex: String, - cudfFormat: String, - orElse: ColumnVector): ColumnVector = { - - withResource(orElse) { orElse => - val prog = new RegexProgram(regex, CaptureGroups.NON_CAPTURE) - val isValidTimestamp = withResource(input.matchesRe(prog)) { isMatch => - withResource(input.isTimestamp(cudfFormat)) { isTimestamp => - isMatch.and(isTimestamp) - } - } - withResource(isValidTimestamp) { isValidTimestamp => - withResource(input.asTimestampMicroseconds(cudfFormat)) { asDays => - isValidTimestamp.ifElse(asDays, orElse) - } - } - } - } - - /** This method does not close the `input` ColumnVector. */ - private def convertFullTimestampOr( - input: ColumnVector, - orElse: ColumnVector): ColumnVector = { - - val cudfFormat1 = "%Y-%m-%d %H:%M:%S.%f" - val cudfFormat2 = "%Y-%m-%dT%H:%M:%S.%f" - val cudfFormat3 = "%Y-%m-%d %H:%M:%S" - val cudfFormat4 = "%Y-%m-%dT%H:%M:%S" - - withResource(orElse) { orElse => - - // valid dates must match the regex and either of the cuDF formats - val isCudfMatch = Seq( - cudfFormat2, - cudfFormat3, - cudfFormat4 - ).foldLeft(input.isTimestamp(cudfFormat1)) { (isTimestamp, nextFormat) => - withResource(isTimestamp) { _ => - withResource(input.isTimestamp(nextFormat)) { nextIsTimeStamp => - isTimestamp.or(nextIsTimeStamp) - } - } - } - - val isValidTimestamp = withResource(isCudfMatch) { _ => - val prog = new RegexProgram(TIMESTAMP_REGEX_FULL, CaptureGroups.NON_CAPTURE) - withResource(input.matchesRe(prog)) { isRegexMatch => - isCudfMatch.and(isRegexMatch) - } - } - - // we only need to parse with one of the cuDF formats because the parsing code ignores - // the ' ' or 'T' between the date and time components - withResource(isValidTimestamp) { _ => - withResource(input.asTimestampMicroseconds(cudfFormat1)) { asDays => - isValidTimestamp.ifElse(asDays, orElse) - } - } - } - } - - def castStringToTimestamp(input: ColumnVector, ansiMode: Boolean): ColumnVector = { - - // special timestamps - val today = DateUtils.currentDate() - val todayStr = new SimpleDateFormat("yyyy-MM-dd") - .format(today * DateUtils.ONE_DAY_SECONDS * 1000L) - - var sanitizedInput = input.incRefCount() - - // prepend today's date to timestamp formats without dates - sanitizedInput = withResource(sanitizedInput) { _ => - val prog = new RegexProgram(TIMESTAMP_REGEX_NO_DATE) - sanitizedInput.stringReplaceWithBackrefs(prog, s"${todayStr}T\\1") - } - - withResource(sanitizedInput) { sanitizedInput => - // convert dates that are in valid timestamp formats - val converted = - convertFullTimestampOr(sanitizedInput, - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d", - convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m", - convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y")))) - - // handle special dates like "epoch", "now", etc. - val finalResult = closeOnExcept(converted) { tsVector => - DateUtils.fetchSpecialDates(DType.TIMESTAMP_MICROSECONDS) match { - case specialDates if specialDates.nonEmpty => - // `tsVector` will be closed in replaceSpecialDates. - replaceSpecialDates(sanitizedInput, tsVector, specialDates) - case _ => - tsVector - } - } - - if (ansiMode) { - // When ANSI mode is enabled, we need to throw an exception if any values could not be - // converted - checkResultForAnsiMode(input, finalResult, - "One or more values could not be converted to TimestampType") - } else { - finalResult - } - } + def castStringToTimestamp(input: ColumnVector, + ansiMode: Boolean, + timeZoneId: Option[String] = None): ColumnVector = { + val timeZone = timeZoneId.map(t => ZoneId.of(t)).getOrElse(ZoneId.of("UTC")) + CastStrings.toTimestamp(input, timeZone, ansiMode) } private def castMapToMap(