Benchmark utility to perform diff of output from benchmark runs, allowing for precision differences #782

Merged Sep 21, 2020 (27 commits; changes shown from 22 commits)

Commits:
08aa904  Benchmark automation POC (andygrove, Sep 15, 2020)
c820427  Default to collecting all results (andygrove, Sep 15, 2020)
f0420ae  address feedback (andygrove, Sep 15, 2020)
a133872  make results action configurable, allowing for results to be written … (andygrove, Sep 15, 2020)
3823800  fix typo in javadoc (andygrove, Sep 15, 2020)
5fa574e  remove unused imports (andygrove, Sep 15, 2020)
dae7177  Remove cold/hot run loops since they were causing confusion (andygrove, Sep 15, 2020)
780ed9f  gc between runs (andygrove, Sep 15, 2020)
15fa656  Make gc optional (andygrove, Sep 16, 2020)
d1f1650  update test (andygrove, Sep 16, 2020)
7585220  Provide specific benchmark methods for collect vs write to CSV or Par… (andygrove, Sep 16, 2020)
c3994b1  provide convenience methods to run benchmarks and store action in json (andygrove, Sep 16, 2020)
15b153b  Add utility method to perform a diff of the data collected from two D… (andygrove, Sep 16, 2020)
49cd605  Add missing license header and remove unused import (andygrove, Sep 16, 2020)
4bd3593  Provide option to ignore ordering of results (andygrove, Sep 16, 2020)
5db1cfb  revert change to compare method (andygrove, Sep 16, 2020)
e6ca6ef  remove rmm_log.txt (andygrove, Sep 16, 2020)
c5c1844  optimize sorting (andygrove, Sep 16, 2020)
2887b4e  merge from branch-0.3 (andygrove, Sep 16, 2020)
d8c821b  let spark sort the data before collecting it (andygrove, Sep 16, 2020)
21c3cc8  fix message (andygrove, Sep 16, 2020)
c9aceb9  Improved documentation (andygrove, Sep 17, 2020)
eebba51  bug fix and optimization to non-iterator case (andygrove, Sep 18, 2020)
8d36be4  fix typo in javadoc (andygrove, Sep 18, 2020)
a8e0c85  fail fast if row counts do not match (andygrove, Sep 18, 2020)
6bccea2  remove redundant logic (andygrove, Sep 18, 2020)
ebf5aa8  scalastyle (andygrove, Sep 18, 2020)
@@ -26,7 +26,8 @@ import org.json4s.jackson.JsonMethods.parse
import org.json4s.jackson.Serialization.writePretty

import org.apache.spark.{SPARK_BUILD_USER, SPARK_VERSION}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
import org.apache.spark.sql.functions.col

object BenchUtils {

@@ -246,6 +247,145 @@ object BenchUtils {
}
}

/**
* Perform a diff of the results collected from two DataFrames, allowing for differences in
* precision.
*
* The intended usage is to run timed benchmarks that write results to file and then separately
* use this utility to compare those result sets. This code performs a sort and a collect and
* is only suitable for data sets that can fit in the driver's memory. For larger datasets,
* a better approach would be to convert the results to single files, download them locally
* and adapt this Scala code to read those files directly (without using Spark).
*
* Example usage:
*
* <pre>
* scala> val cpu = spark.read.parquet("/data/q5-cpu")
* scala> val gpu = spark.read.parquet("/data/q5-gpu")
* scala> import com.nvidia.spark.rapids.tests.common._
* scala> BenchUtils.compareResults(cpu, gpu, ignoreOrdering=true, epsilon=0.0)
* Collecting rows from DataFrame
* Collected 989754 rows in 7.701 seconds
* Collecting rows from DataFrame
* Collected 989754 rows in 2.325 seconds
* Results match
* </pre>
*
* @param df1 DataFrame to compare.
* @param df2 DataFrame to compare.
* @param ignoreOrdering Sort the data collected from the DataFrames before comparing them.
* @param maxErrors Maximum number of differences to report.
* @param epsilon Allow for differences in precision when comparing floating point values.
*/
def compareResults(
df1: DataFrame,
df2: DataFrame,
ignoreOrdering: Boolean,
maxErrors: Int = 10,
epsilon: Double = 0.00001): Unit = {

val result1: Seq[Seq[Any]] = collectResults(df1, ignoreOrdering)
val result2: Seq[Seq[Any]] = collectResults(df2, ignoreOrdering)

if (result1.length == result2.length) {
var errors = 0
var i = 0
while (i < result1.length && errors < maxErrors) {
val l = result1(i)
val r = result2(i)
if (!rowEqual(l, r, epsilon)) {
println(s"Row $i:\n${l.mkString(",")}\n${r.mkString(",")}\n")
errors += 1
if (errors == maxErrors) {
println(s"Aborting comparison after reaching maximum of $maxErrors errors")
}
}
i += 1
}

if (errors == 0) {
println("Results match")
}

} else {
println(s"Row counts do not match: ${result1.length} != ${result2.length}")
}
}

private def collectResults(df: DataFrame, ignoreOrdering: Boolean): Seq[Seq[Any]] = {
Collaborator: Considering this is a benchmarking utility, should we also provide an iterator in case the collected results are too big?

Contributor (author): Spark's DataFrame.collect() loads the full result set into the driver and returns an Array. If (when) we need to handle comparisons of larger results, I think we would need an alternate approach, like converting the results to single files, downloading them, and using Scala code (without Spark) to perform the diff.
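For reference, a rough sketch of that file-based alternative; the helper name is illustrative and not part of this change:

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical helper (not part of this PR): coalesce a result set down to a
// single CSV file so it can be downloaded and diffed locally without Spark.
def writeSingleFile(df: DataFrame, path: String): Unit = {
  df.coalesce(1)                      // one partition => one output file
    .write
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .csv(path)
}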

Collaborator: It is a relatively minor change to be able to do this with an iterator. We have that support in the python unit tests. The only issue is that you cannot compare the size of the results ahead of time, because you don't know it yet. But I agree that until we hit a situation where we need it there is no point in doing it. Also, it is very slow to try to do a comparison like that single threaded. It might be better to truncate floating point values and do a few anti-joins to see if there is something in the left that is not in the right, and vice versa. This might not handle duplicate rows, so we might need something there too, but it would scale a lot better.
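A rough sketch of that anti-join idea; the helper name, rounding scale, and per-column handling are assumptions for illustration only:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, round}
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Illustrative only: round float/double columns so tiny precision differences
// do not break the join keys, then anti-join in both directions to surface
// rows that exist on one side but not the other (does not count duplicates).
def diffByAntiJoin(df1: DataFrame, df2: DataFrame, scale: Int = 5): (Long, Long) = {
  def truncate(df: DataFrame): DataFrame =
    df.schema.fields.foldLeft(df) { (d, f) =>
      f.dataType match {
        case DoubleType | FloatType => d.withColumn(f.name, round(col(f.name), scale))
        case _ => d
      }
    }
  val left = truncate(df1)
  val right = truncate(df2)
  val onlyInLeft = left.join(right, left.columns.toSeq, "left_anti").count()
  val onlyInRight = right.join(left, right.columns.toSeq, "left_anti").count()
  (onlyInLeft, onlyInRight)
}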

Contributor (author): I'd like to know more about the iterator approach. I did look at the toLocalIterator method, but this seems to load one partition at a time, so I don't think that would work when we need ordering across the result set. Could you point me to an example where we do this in the Python tests?

Collaborator: It loads one partition of the result at a time, so it does preserve ordering. It is just rather slow because we are doing the comparison single threaded.
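For illustration, a minimal sketch of what such a toLocalIterator-based comparison could look like; the helper is hypothetical and is not the iterator option referenced later in this thread:

import org.apache.spark.sql.{DataFrame, Row}

// Illustrative only: stream rows via toLocalIterator, which pulls one
// partition at a time instead of collecting the whole result set, and compare
// the two DataFrames pairwise on the driver.
def iteratorsMatch(df1: DataFrame, df2: DataFrame)(rowEq: (Row, Row) => Boolean): Boolean = {
  val it1 = df1.toLocalIterator()
  val it2 = df2.toLocalIterator()
  var matching = true
  while (matching && it1.hasNext && it2.hasNext) {
    matching = rowEq(it1.next(), it2.next())
  }
  // both iterators must also be exhausted, otherwise the row counts differ
  matching && !it1.hasNext && !it2.hasNext
}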

Collaborator: Check out mortgage_test.py.

Contributor (author): I added the option to use an iterator and have been testing this out, and it seems to work well.

println("Collecting rows from DataFrame")
val t1 = System.currentTimeMillis()
val rows = if (ignoreOrdering) {
// let Spark do the sorting
df.sort(df.columns.map(col): _*).collect()
Collaborator: The downside to this is if the sort is broken; is this forcing a CPU sort, for instance?

Collaborator: The only known issue with GPU sort vs CPU sort is #84, which I don't think we will ever run into outside of artificially generated data sets, which is why we have not pushed to fix it.

Collaborator (replying to the CPU-sort question above): The job config is outside of the tool. I think that to verify the results we should let standard CPU Spark do it; nothing stops you from running this with the plugin, so it's up to the user at this point.

Collaborator: I guess we have other tests for correctness, so ignore my comment, and hopefully the GPU sort should be faster.

} else {
df.collect()
}
val t2 = System.currentTimeMillis()
println(s"Collected ${rows.length} rows in ${(t2-t1)/1000.0} seconds")
rows.map(_.toSeq)
}

private def rowEqual(row1: Seq[Any], row2: Seq[Any], epsilon: Double): Boolean = {
row1.zip(row2).forall {
case (l, r) => compare(l, r, epsilon)
}
}

// this is copied from SparkQueryCompareTestSuite
abellina marked this conversation as resolved.
private def compare(expected: Any, actual: Any, epsilon: Double = 0.0): Boolean = {
def doublesAreEqualWithinPercentage(expected: Double, actual: Double): (String, Boolean) = {
if (!compare(expected, actual)) {
if (expected != 0) {
val v = Math.abs((expected - actual) / expected)
(s"\n\nABS($expected - $actual) / ABS($actual) == $v is not <= $epsilon ", v <= epsilon)
} else {
val v = Math.abs(expected - actual)
(s"\n\nABS($expected - $actual) == $v is not <= $epsilon ", v <= epsilon)
}
} else {
("SUCCESS", true)
}
}
(expected, actual) match {
case (a: Float, b: Float) if a.isNaN && b.isNaN => true
case (a: Double, b: Double) if a.isNaN && b.isNaN => true
case (null, null) => true
case (null, _) => false
case (_, null) => false
case (a: Array[_], b: Array[_]) =>
a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
case (a: Map[_, _], b: Map[_, _]) =>
a.size == b.size && a.keys.forall { aKey =>
b.keys.find(bKey => compare(aKey, bKey))
.exists(bKey => compare(a(aKey), b(bKey)))
}
case (a: Iterable[_], b: Iterable[_]) =>
a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r, epsilon) }
case (a: Product, b: Product) =>
compare(a.productIterator.toSeq, b.productIterator.toSeq, epsilon)
case (a: Row, b: Row) =>
compare(a.toSeq, b.toSeq, epsilon)
// 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0.
case (a: Double, b: Double) if epsilon <= 0 =>
java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b)
case (a: Double, b: Double) if epsilon > 0 =>
val ret = doublesAreEqualWithinPercentage(a, b)
if (!ret._2) {
System.err.println(ret._1 + " (double)")
}
ret._2
case (a: Float, b: Float) if epsilon <= 0 =>
java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b)
case (a: Float, b: Float) if epsilon > 0 =>
val ret = doublesAreEqualWithinPercentage(a, b)
if (!ret._2) {
System.err.println(ret._1 + " (float)")
}
ret._2
case (a, b) => a == b
}
}
}

/** Top level benchmark report class */