Skip to content

Commit

Permalink
Add fuzz tests for cast from string to other types (#2898)
Browse files — browse the repository at this point in the history
  • Loading branch information
andygrove authored Jul 13, 2021
1 parent 1c3fc8b commit 76c57f1
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 34 deletions.
135 changes: 135 additions & 0 deletions tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import java.util.TimeZone

import ai.rapids.cudf.ColumnVector
import scala.collection.JavaConverters._
import scala.util.Random

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
Expand Down Expand Up @@ -56,6 +57,140 @@ class CastOpSuite extends GpuExpressionTestSuite {
for (from <- supportedTypes; to <- supportedTypes) yield (from, to)
}

// Character pool for boolean fuzzing: whitespace, upper/lower-case
// true/false literals, yes/no variants, and the digits 0/1.
private val BOOL_CHARS = " \t\r\nFALSEfalseTRUEtrue01yesYESnoNO"
// Character pool for numeric fuzzing: digits, sign, decimal point,
// exponent markers (e/E), whitespace, and the letters of "inf".
private val NUMERIC_CHARS = "inf \t\r\n0123456789.+-eE"
// Character pool for date/timestamp fuzzing: digits plus the separators
// used by ISO-8601-style values (':', '-', '/', 'T', 'Z') and whitespace.
private val DATE_CHARS = " \t\r\n0123456789:-/TZ"

ignore("Cast from string to boolean using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2902
  // Exercise very short strings first (length 1, then 3), then the default length.
  for (len <- Seq(1, 3)) {
    testCastStringTo(DataTypes.BooleanType,
      generateRandomStrings(Some(BOOL_CHARS), maxStringLen = len))
  }
  testCastStringTo(DataTypes.BooleanType, generateRandomStrings(Some(BOOL_CHARS)))
}

ignore("Cast from string to boolean using hand-picked values") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2902
  // Edge cases: leading whitespace, mixed-case literals, single letters, digits.
  val candidates = Seq("\n\nN", "False", "FALSE", "false", "FaLsE",
    "f", "F", "True", "TRUE", "true", "tRuE", "t", "T", "Y", "y", "10", "01", "0", "1")
  testCastStringTo(DataTypes.BooleanType, candidates)
}

ignore("Cast from string to byte using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.ByteType, inputs)
}

ignore("Cast from string to short using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.ShortType, inputs)
}

ignore("Cast from string to int using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.IntegerType, inputs)
}

ignore("Cast from string to long using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.LongType, inputs)
}

ignore("Cast from string to float using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2900
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.FloatType, inputs)
}

ignore("Cast from string to double using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2900
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.DoubleType, inputs)
}

test("Cast from string to date using random inputs") {
  // Cap the length at 8 characters to keep inputs in a date-like range.
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8)
  testCastStringTo(DataTypes.DateType, inputs)
}

test("Cast from string to date using random inputs with valid year prefix") {
  // Prefix every generated string with the literal "2021".
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8, Some("2021"))
  testCastStringTo(DataTypes.DateType, inputs)
}

ignore("Cast from string to timestamp using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2889
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, None)
  testCastStringTo(DataTypes.TimestampType, inputs)
}

ignore("Cast from string to timestamp using random inputs with valid year prefix") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2889
  // Prefix every generated string with the literal "2021-".
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, Some("2021-"))
  testCastStringTo(DataTypes.TimestampType, inputs)
}

/**
 * Generates a deterministic batch of 8192 pseudo-random strings for fuzz testing.
 *
 * @param validChars   when defined, strings are drawn only from these characters;
 *                     otherwise arbitrary strings are generated
 * @param maxStringLen upper bound on the generated length (excluding the prefix)
 * @param prefix       optional literal prepended to every generated string
 * @return the generated strings; reproducible because the RNG seed is fixed at 0
 */
private def generateRandomStrings(
    validChars: Option[String],
    maxStringLen: Int = 12,
    prefix: Option[String] = None): Seq[String] = {
  val randomValueCount = 8192

  // Fixed seed keeps CPU-vs-GPU comparison failures reproducible.
  val random = new Random(0)
  val r = new EnhancedRandom(random,
    FuzzerOptions(validChars, maxStringLen))

  // Hoisted out of the loop: the prefix is invariant for the whole batch.
  val pre = prefix.getOrElse("")
  (0 until randomValueCount).map(_ => pre + r.nextString())
}

/**
 * Compares CPU and GPU results of casting the given strings to `toType`.
 *
 * The same cast is executed in a CPU session and a GPU session (with the
 * optional string-cast configs enabled); rows are aligned by a generated id
 * column and the test fails with the offending input on the first mismatch.
 *
 * Note: uses `: Unit =` instead of the deprecated procedure syntax.
 */
private def testCastStringTo(toType: DataType, strings: Seq[String]): Unit = {

  // Builds (c0 = input string, id = row index, c1 = cast(c0 as toType)) and
  // collects the rows; repartition(2) forces an exchange in the plan.
  def castDf(spark: SparkSession): Seq[Row] = {
    import spark.implicits._
    val df = strings.zipWithIndex.toDF("c0", "id").repartition(2)
    val castDf = df.withColumn("c1", col("c0").cast(toType))
    castDf.collect()
  }

  // Column positions within the collected rows.
  val INDEX_ID = 1
  val INDEX_C0 = 0
  val INDEX_C1 = 2

  // Row order after repartition/collect is not guaranteed, so sort by id.
  val cpu = withCpuSparkSession(castDf)
    .sortBy(_.getInt(INDEX_ID))

  // Enable the string-cast configs that are not on by default.
  val conf = new SparkConf()
    .set(RapidsConf.EXPLAIN.key, "ALL")
    .set(RapidsConf.INCOMPATIBLE_DATE_FORMATS.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_FLOAT.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_DECIMAL.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")

  val gpu = withGpuSparkSession(castDf, conf)
    .sortBy(_.getInt(INDEX_ID))

  for ((cpuRow, gpuRow) <- cpu.zip(gpu)) {
    // Sanity-check that rows are aligned before comparing cast results.
    assert(cpuRow.getString(INDEX_C0) === gpuRow.getString(INDEX_C0))
    assert(cpuRow.getInt(INDEX_ID) === gpuRow.getInt(INDEX_ID))
    val cpuValue = cpuRow.get(INDEX_C1)
    val gpuValue = gpuRow.get(INDEX_C1)
    if (!compare(cpuValue, gpuValue)) {
      val inputValue = cpuRow.getString(INDEX_C0)
      fail(s"Mismatch casting string [$inputValue] " +
        s"to $toType. CPU: $cpuValue; GPU: $gpuValue")
    }
  }
}

test("Test all supported casts with in-range values") {
// test cast() and ansi_cast()
Seq(false, true).foreach { ansiEnabled =>
Expand Down
47 changes: 17 additions & 30 deletions tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@ object FuzzerUtils {
/**
* Default options when generating random data.
*/
private val DEFAULT_OPTIONS = FuzzerOptions(
numbersAsStrings = true,
asciiStringsOnly = false,
maxStringLen = 64)
private val DEFAULT_OPTIONS = FuzzerOptions()

/**
* Create a schema with the specified data types.
Expand Down Expand Up @@ -331,20 +328,6 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
}
}

def nextString(): String = {
if (options.numbersAsStrings) {
r.nextInt(5) match {
case 0 => String.valueOf(r.nextInt())
case 1 => String.valueOf(r.nextLong())
case 2 => String.valueOf(r.nextFloat())
case 3 => String.valueOf(r.nextDouble())
case 4 => generateString()
}
} else {
generateString()
}
}

def nextDate(): Date = {
val futureDate = 6321706291000L // Upper limit Sunday, April 29, 2170 9:31:31 PM
new Date((futureDate * r.nextDouble()).toLong);
Expand All @@ -355,22 +338,26 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
new Timestamp((futureDate * r.nextDouble()).toLong)
}

private def generateString(): String = {
if (options.asciiStringsOnly) {
val b = new StringBuilder()
for (_ <- 0 until options.maxStringLen) {
b.append(ASCII_CHARS.charAt(r.nextInt(ASCII_CHARS.length)))
}
b.toString
} else {
r.nextString(r.nextInt(options.maxStringLen))
def nextString(): String = {
  // Pick a random length below the configured maximum, then either draw
  // from the restricted character set or delegate to Random.nextString.
  val len = r.nextInt(options.maxStringLen)
  options.validStringChars
    .map(valid => nextString(valid, len))
    .getOrElse(r.nextString(len))
}

def nextString(validStringChars: String, maxStringLen: Int): String = {
  // Draws exactly `maxStringLen` characters uniformly from `validStringChars`,
  // consuming one r.nextInt per character.
  val chars = Array.fill(maxStringLen) {
    validStringChars.charAt(r.nextInt(validStringChars.length))
  }
  new String(chars)
}

// Lower-case ASCII alphabet for ASCII-only string generation.
// NOTE(review): appears unused now that generateString() was removed — confirm
// against the full file before deleting.
private val ASCII_CHARS = "abcdefghijklmnopqrstuvwxyz"
}

case class FuzzerOptions(
numbersAsStrings: Boolean = true,
asciiStringsOnly: Boolean = false,
validStringChars: Option[String] = None,
maxStringLen: Int = 64)
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
}

def firstDf(spark: SparkSession): DataFrame = {
val options = FuzzerOptions(asciiStringsOnly = true, numbersAsStrings = false,
maxStringLen = 4)
val options = FuzzerOptions(maxStringLen = 4)
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, DataTypes.IntegerType))
FuzzerUtils.generateDataFrame(spark, schema, 100, options, seed = 0)
.withColumn("c2", col("c1").mod(lit(10)))
Expand Down Expand Up @@ -857,8 +856,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
private def randomDF(dataType: DataType)(spark: SparkSession) : DataFrame = {
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, dataType))
FuzzerUtils.generateDataFrame(spark, schema, rowCount = 1000,
options = FuzzerOptions(numbersAsStrings = false, asciiStringsOnly = true,
maxStringLen = 2))
options = FuzzerOptions(maxStringLen = 2))
}

FLOAT_TEST_testSparkResultsAreEqual("empty df: reduction count", floatCsvDf,
Expand Down

0 comments on commit 76c57f1

Please sign in to comment.