Add support for more date formats and remove incompat from to_unix_timestamp

Signed-off-by: Andy Grove <[email protected]>
andygrove committed Nov 17, 2020
1 parent 9021145 commit 94bfbf9
Showing 4 changed files with 34 additions and 13 deletions.
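For context, a minimal PySpark sketch of what this commit enables — parsing a slash-separated pattern on the GPU. The DataFrame and session setup are illustrative, not taken from the tests below:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # 'yyyy/MM' is one of the formats this commit adds to the GPU-compatible
    # list; previously only patterns such as 'yyyy-MM-dd' were supported.
    df = spark.createDataFrame([("1975/06",), ("1999/12",)], ["a"])
    df.selectExpr("to_unix_timestamp(a, 'yyyy/MM')").show()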
19 changes: 9 additions & 10 deletions integration_tests/src/main/python/date_time_test.py
@@ -160,46 +160,45 @@ def test_dayofyear(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a'))))

@incompat #Really only the string is
@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
def test_unix_timestamp(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))))

@incompat #Really only the string is
@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
def test_to_unix_timestamp(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"))

@incompat #Really only the string is
@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
def test_unix_timestamp_improved(data_gen):
conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"}
conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true",
"spark.sql.legacy.timeParserPolicy": "EXCEPTION"}
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))), conf)

@incompat #Really only the string is
@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn)
def test_to_unix_timestamp_improved(data_gen):
conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"}
conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true",
"spark.sql.legacy.timeParserPolicy": "EXCEPTION"}
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), conf)

str_date_and_format_gen = [pytest.param(StringGen('[0-9]{4}/[01][0-9]'),'yyyy/MM', marks=pytest.mark.xfail(reason="cudf does no checks")),
(StringGen('[0-9]{4}/[01][12]/[0-2][1-8]'),'yyyy/MM/dd'),
(ConvertGen(DateGen(nullable=False), lambda d: d.strftime('%Y/%m').zfill(7), data_type=StringType()), 'yyyy/MM')]

@incompat
@pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn)
def test_string_to_unix_timestamp(data_gen, date_form):
print("date: " + date_form)
print("test_string_to_unix_timestamp date_form: {}".format(date_form))
conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true",
"spark.sql.legacy.timeParserPolicy": "EXCEPTION"}
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)))
lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), conf)

@incompat
@pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn)
def test_string_unix_timestamp(data_gen, date_form):
print("test_string_unix_timestamp date_form: {}".format(date_form))
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)))

@@ -1075,9 +1075,7 @@ object GpuOverrides {
GpuToUnixTimestamp(lhs, rhs, sparkFormat, strfFormat)
}
}
})
.incompat("Incorrectly formatted strings and bogus dates produce garbage data" +
" instead of null"),
}),
expr[UnixTimestamp](
"Returns the UNIX timestamp of current or specified time",
(a, conf, p, r) => new UnixTimeExprMeta[UnixTimestamp](a, conf, p, r){
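Removing the .incompat(...) call above means GpuToUnixTimestamp no longer has to be explicitly enabled as an incompatible operation. As a sketch of what that opt-in looked like — my reading of the @incompat marker in the tests above; the flag name is the plugin's incompatible-ops setting, not stated in this diff:

    # Before this commit, GPU execution of to_unix_timestamp required an
    # opt-in, because malformed strings produced garbage data instead of null:
    conf = {"spark.rapids.sql.incompatibleOps.enabled": "true"}
    # After this commit, to_unix_timestamp runs on the GPU without that flag.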
@@ -327,6 +327,9 @@ object GpuToTimestamp {
/** We are compatible with Spark for these formats */
val COMPATIBLE_FORMATS = Seq(
"yyyy-MM-dd",
"yyyy-MM",
"yyyy/MM/dd",
"yyyy/MM",
"dd/MM/yyyy",
"yyyy-MM-dd HH:mm:ss"
)
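Since the comment above says these formats are Spark-compatible, a quick way to sanity-check any of the listed patterns is to run them against the CPU directly. A minimal sketch mirroring the Python tests; the data and session setup are illustrative:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")

    rows = [("1999-12-31",), ("1999/12/31",), ("1999-12",), ("1999/12",)]
    df = spark.createDataFrame(rows, ["a"])

    # Each pattern below is in COMPATIBLE_FORMATS after this commit; rows
    # that do not match a given pattern come back as null under CORRECTED.
    for fmt in ["yyyy-MM-dd", "yyyy/MM/dd", "yyyy-MM", "yyyy/MM"]:
        df.selectExpr("to_unix_timestamp(a, '{}')".format(fmt)).show()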
@@ -47,6 +47,21 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite {
df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd"))
}

testSparkResultsAreEqual("unix_timestamp parse yyyy/MM",
timestampsAsStrings,
new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) {
df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy/MM"))
}

testSparkResultsAreEqual("to_unix_timestamp parse yyyy/MM",
timestampsAsStrings,
new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) {
df => {
df.createOrReplaceTempView("df")
df.sqlContext.sql("SELECT c0, to_unix_timestamp(c0, 'yyyy/MM') FROM df")
}
}

testSparkResultsAreEqual("unix_timestamp parse timestamp",
timestampsAsStrings,
new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) {
@@ -128,6 +143,12 @@
"31/12/1999",
"31/12/1999 11:59:59.999",
"1999-12-31",
"1999/12/31",
"1999-12",
"1999/12",
"1975/06",
"1975/06/18",
"1975/06/18 06:48:57",
"1999-12-31\n",
"\t1999-12-31",
"\n1999-12-31",
