From fab7f9a07edcccdf7217406d916706f80623066c Mon Sep 17 00:00:00 2001 From: mythrocks Date: Wed, 24 Jun 2020 10:41:54 -0700 Subject: [PATCH] Add conditional xfail test for DISTINCT aggregates with NaN (#261) SPARK-32038 reports a regression in Apache Spark 3.0.0: it fails to normalize NaN/zero float values during DISTINCT aggregations. This causes a mismatch in results between Apache Spark 3.0.0 on CPU and the RAPIDS Accelerator (which returns the right results). SPARK-32038 was fixed in apache/spark#28876. This commit introduces a conditional xfail test that passes on Apache Spark 3.0.1 and 3.1+ (which include the fix for SPARK-32038), but produces an expected failure on Spark 3.0.0. --- .../src/main/python/hash_aggregate_test.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index eb306a81ee3..60f7212b335 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -19,7 +19,7 @@ from pyspark.sql.types import * from marks import * import pyspark.sql.functions as f -from spark_session import with_cpu_session +from spark_session import with_cpu_session, with_spark_session _no_nans_float_conf = {'spark.rapids.sql.variableFloatAgg.enabled': 'true', 'spark.rapids.sql.hasNans': 'false', @@ -338,8 +338,11 @@ def test_hash_agg_with_nan_keys(data_gen): conf=_no_nans_float_conf) -@pytest.mark.xfail(reason="count(distinct floats) fails when there are NaN values in the aggregation column." 
- "(https://github.com/NVIDIA/spark-rapids/issues/194)") +@pytest.mark.xfail( + condition=with_spark_session(lambda spark : spark.sparkContext.version == "3.0.0"), + reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate " + "(https://github.com/apache/spark/pull/28876) " + "Fixed in later Apache Spark releases.") @approximate_float @ignore_order @pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) @@ -354,6 +357,5 @@ def test_count_distinct_with_nan_floats(data_gen): 'from hash_agg_table group by a'), conf=_no_nans_float_conf) - # TODO: Literal tests # TODO: First and Last tests