Skip to content

Commit

Permalink
Add fuzz tests for cast from string to other types (#2898)
Browse files — browse the repository at this point in the history
  • Loading branch information
andygrove authored Jul 13, 2021
1 parent 1c3fc8b commit 76c57f1
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 34 deletions.
135 changes: 135 additions & 0 deletions tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import java.util.TimeZone

import ai.rapids.cudf.ColumnVector
import scala.collection.JavaConverters._
import scala.util.Random

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
Expand Down Expand Up @@ -56,6 +57,140 @@ class CastOpSuite extends GpuExpressionTestSuite {
for (from <- supportedTypes; to <- supportedTypes) yield (from, to)
}

// Character pool for boolean fuzzing: whitespace, upper/lower-case
// true/false literals, yes/no variants, and the digits 0/1.
private val BOOL_CHARS = " \t\r\nFALSEfalseTRUEtrue01yesYESnoNO"
// Character pool for numeric fuzzing: digits, sign, decimal point,
// exponent markers (e/E), whitespace, and the letters of "inf".
private val NUMERIC_CHARS = "inf \t\r\n0123456789.+-eE"
// Character pool for date/timestamp fuzzing: digits plus the separators
// used by ISO-8601-style values (':', '-', '/', 'T', 'Z') and whitespace.
private val DATE_CHARS = " \t\r\n0123456789:-/TZ"

ignore("Cast from string to boolean using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2902
  // Exercise very short strings first (length 1, then 3), then the default length.
  for (len <- Seq(1, 3)) {
    testCastStringTo(DataTypes.BooleanType,
      generateRandomStrings(Some(BOOL_CHARS), maxStringLen = len))
  }
  testCastStringTo(DataTypes.BooleanType, generateRandomStrings(Some(BOOL_CHARS)))
}

ignore("Cast from string to boolean using hand-picked values") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2902
  // Edge cases: leading whitespace, mixed-case literals, single letters, digits.
  val candidates = Seq("\n\nN", "False", "FALSE", "false", "FaLsE",
    "f", "F", "True", "TRUE", "true", "tRuE", "t", "T", "Y", "y", "10", "01", "0", "1")
  testCastStringTo(DataTypes.BooleanType, candidates)
}

ignore("Cast from string to byte using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.ByteType, inputs)
}

ignore("Cast from string to short using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.ShortType, inputs)
}

ignore("Cast from string to int using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.IntegerType, inputs)
}

ignore("Cast from string to long using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2899
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.LongType, inputs)
}

ignore("Cast from string to float using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2900
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.FloatType, inputs)
}

ignore("Cast from string to double using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2900
  val inputs = generateRandomStrings(Some(NUMERIC_CHARS))
  testCastStringTo(DataTypes.DoubleType, inputs)
}

test("Cast from string to date using random inputs") {
  // Cap the length at 8 characters to keep inputs in a date-like range.
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8)
  testCastStringTo(DataTypes.DateType, inputs)
}

test("Cast from string to date using random inputs with valid year prefix") {
  // Prefix every generated string with the literal "2021".
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8, Some("2021"))
  testCastStringTo(DataTypes.DateType, inputs)
}

ignore("Cast from string to timestamp using random inputs") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2889
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, None)
  testCastStringTo(DataTypes.TimestampType, inputs)
}

ignore("Cast from string to timestamp using random inputs with valid year prefix") {
  // Test ignored due to known issues
  // https://github.com/NVIDIA/spark-rapids/issues/2889
  // Prefix every generated string with the literal "2021-".
  val inputs = generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, Some("2021-"))
  testCastStringTo(DataTypes.TimestampType, inputs)
}

/**
 * Generates a deterministic batch of 8192 pseudo-random strings for fuzz testing.
 *
 * @param validChars   when defined, strings are drawn only from these characters;
 *                     otherwise arbitrary strings are generated
 * @param maxStringLen upper bound on the generated length (excluding the prefix)
 * @param prefix       optional literal prepended to every generated string
 * @return the generated strings; reproducible because the RNG seed is fixed at 0
 */
private def generateRandomStrings(
    validChars: Option[String],
    maxStringLen: Int = 12,
    prefix: Option[String] = None): Seq[String] = {
  val randomValueCount = 8192

  // Fixed seed keeps CPU-vs-GPU comparison failures reproducible.
  val random = new Random(0)
  val r = new EnhancedRandom(random,
    FuzzerOptions(validChars, maxStringLen))

  // Hoisted out of the loop: the prefix is invariant for the whole batch.
  val pre = prefix.getOrElse("")
  (0 until randomValueCount).map(_ => pre + r.nextString())
}

/**
 * Compares CPU and GPU results of casting the given strings to `toType`.
 *
 * The same cast is executed in a CPU session and a GPU session (with the
 * optional string-cast configs enabled); rows are aligned by a generated id
 * column and the test fails with the offending input on the first mismatch.
 *
 * Note: uses `: Unit =` instead of the deprecated procedure syntax.
 */
private def testCastStringTo(toType: DataType, strings: Seq[String]): Unit = {

  // Builds (c0 = input string, id = row index, c1 = cast(c0 as toType)) and
  // collects the rows; repartition(2) forces an exchange in the plan.
  def castDf(spark: SparkSession): Seq[Row] = {
    import spark.implicits._
    val df = strings.zipWithIndex.toDF("c0", "id").repartition(2)
    val castDf = df.withColumn("c1", col("c0").cast(toType))
    castDf.collect()
  }

  // Column positions within the collected rows.
  val INDEX_ID = 1
  val INDEX_C0 = 0
  val INDEX_C1 = 2

  // Row order after repartition/collect is not guaranteed, so sort by id.
  val cpu = withCpuSparkSession(castDf)
    .sortBy(_.getInt(INDEX_ID))

  // Enable the string-cast configs that are not on by default.
  val conf = new SparkConf()
    .set(RapidsConf.EXPLAIN.key, "ALL")
    .set(RapidsConf.INCOMPATIBLE_DATE_FORMATS.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_FLOAT.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_DECIMAL.key, "true")
    .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")

  val gpu = withGpuSparkSession(castDf, conf)
    .sortBy(_.getInt(INDEX_ID))

  for ((cpuRow, gpuRow) <- cpu.zip(gpu)) {
    // Sanity-check that rows are aligned before comparing cast results.
    assert(cpuRow.getString(INDEX_C0) === gpuRow.getString(INDEX_C0))
    assert(cpuRow.getInt(INDEX_ID) === gpuRow.getInt(INDEX_ID))
    val cpuValue = cpuRow.get(INDEX_C1)
    val gpuValue = gpuRow.get(INDEX_C1)
    if (!compare(cpuValue, gpuValue)) {
      val inputValue = cpuRow.getString(INDEX_C0)
      fail(s"Mismatch casting string [$inputValue] " +
        s"to $toType. CPU: $cpuValue; GPU: $gpuValue")
    }
  }
}

test("Test all supported casts with in-range values") {
// test cast() and ansi_cast()
Seq(false, true).foreach { ansiEnabled =>
Expand Down
47 changes: 17 additions & 30 deletions tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@ object FuzzerUtils {
/**
* Default options when generating random data.
*/
private val DEFAULT_OPTIONS = FuzzerOptions(
numbersAsStrings = true,
asciiStringsOnly = false,
maxStringLen = 64)
private val DEFAULT_OPTIONS = FuzzerOptions()

/**
* Create a schema with the specified data types.
Expand Down Expand Up @@ -331,20 +328,6 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
}
}

def nextString(): String = {
if (options.numbersAsStrings) {
r.nextInt(5) match {
case 0 => String.valueOf(r.nextInt())
case 1 => String.valueOf(r.nextLong())
case 2 => String.valueOf(r.nextFloat())
case 3 => String.valueOf(r.nextDouble())
case 4 => generateString()
}
} else {
generateString()
}
}

def nextDate(): Date = {
val futureDate = 6321706291000L // Upper limit Sunday, April 29, 2170 9:31:31 PM
new Date((futureDate * r.nextDouble()).toLong);
Expand All @@ -355,22 +338,26 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
new Timestamp((futureDate * r.nextDouble()).toLong)
}

private def generateString(): String = {
if (options.asciiStringsOnly) {
val b = new StringBuilder()
for (_ <- 0 until options.maxStringLen) {
b.append(ASCII_CHARS.charAt(r.nextInt(ASCII_CHARS.length)))
}
b.toString
} else {
r.nextString(r.nextInt(options.maxStringLen))
def nextString(): String = {
  // Pick a random length below the configured maximum, then either draw
  // from the restricted character set or delegate to Random.nextString.
  val len = r.nextInt(options.maxStringLen)
  options.validStringChars
    .map(valid => nextString(valid, len))
    .getOrElse(r.nextString(len))
}

def nextString(validStringChars: String, maxStringLen: Int): String = {
  // Draws exactly `maxStringLen` characters uniformly from `validStringChars`,
  // consuming one r.nextInt per character.
  val chars = Array.fill(maxStringLen) {
    validStringChars.charAt(r.nextInt(validStringChars.length))
  }
  new String(chars)
}

// Lower-case ASCII alphabet for ASCII-only string generation.
// NOTE(review): appears unused now that generateString() was removed — confirm
// against the full file before deleting.
private val ASCII_CHARS = "abcdefghijklmnopqrstuvwxyz"
}

case class FuzzerOptions(
numbersAsStrings: Boolean = true,
asciiStringsOnly: Boolean = false,
validStringChars: Option[String] = None,
maxStringLen: Int = 64)
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
}

def firstDf(spark: SparkSession): DataFrame = {
val options = FuzzerOptions(asciiStringsOnly = true, numbersAsStrings = false,
maxStringLen = 4)
val options = FuzzerOptions(maxStringLen = 4)
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, DataTypes.IntegerType))
FuzzerUtils.generateDataFrame(spark, schema, 100, options, seed = 0)
.withColumn("c2", col("c1").mod(lit(10)))
Expand Down Expand Up @@ -857,8 +856,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
private def randomDF(dataType: DataType)(spark: SparkSession) : DataFrame = {
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, dataType))
FuzzerUtils.generateDataFrame(spark, schema, rowCount = 1000,
options = FuzzerOptions(numbersAsStrings = false, asciiStringsOnly = true,
maxStringLen = 2))
options = FuzzerOptions(maxStringLen = 2))
}

FLOAT_TEST_testSparkResultsAreEqual("empty df: reduction count", floatCsvDf,
Expand Down

0 comments on commit 76c57f1

Please sign in to comment.