From c1a0c614f6c4d09e88c8b3ca1be9cb3ac445198b Mon Sep 17 00:00:00 2001
From: MithunR
Date: Thu, 30 Nov 2023 22:05:15 -0800
Subject: [PATCH] Skip fastparquet timestamp tests when plugin cannot read/write timestamps (#9831)

* Skip fastparquet timestamp tests for non-UTC timezones.

Fixes #9776.

The tests in `fastparquet_compatibility_test.py` check for compatibility
between Apache Spark, the Spark RAPIDS plugin, and fastparquet. In particular:

1. `test_reading_file_written_by_spark_cpu` checks if timestamp columns
   written with Apache Spark are read similarly with fastparquet and the
   plugin.
2. `test_reading_file_written_with_gpu` checks if timestamps written with
   the plugin are read the same on Apache Spark and fastparquet.

If the timezone is not set to "UTC", and the system timezone isn't "UTC"
either, the plugin falls back to CPU for read/write of Parquet timestamp
columns. This would prevent the above tests from running on the GPU: the
plugin can neither read nor write timestamps on GPU.

Further, fastparquet seems to interpret timestamps written from Spark as
being in "UTC", regardless of the timezone settings. So on non-UTC
timezones, Apache Spark and fastparquet get different results for the same
input.

For the two reasons above, it is best to only run the three-way timestamp
comparison tests in setups that use the "UTC" timezone.

This commit skips the timestamp tests described above when a non-UTC
timezone is detected.

Signed-off-by: MithunR --- .../python/fastparquet_compatibility_test.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index a12bd223778..11bc389fb0a 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -124,9 +124,12 @@ def read_with_fastparquet_or_plugin(spark): marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")), pytest.param(DateGen(nullable=False), marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. year=8705) incorrectly.")), - TimestampGen(nullable=False, - start=pandas_min_datetime, - end=pandas_max_datetime), # Vanilla case. + pytest.param(TimestampGen(nullable=False, + start=pandas_min_datetime, + end=pandas_max_datetime), + marks=pytest.mark.skipif(condition=is_not_utc(), + reason="fastparquet interprets timestamps in UTC timezone, regardless " + "of timezone settings")), # Vanilla case. pytest.param(TimestampGen(nullable=False, start=pandas_min_datetime, end=pandas_max_datetime), @@ -201,9 +204,12 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")), pytest.param(DateGen(nullable=False), marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. year=8705) incorrectly.")), - TimestampGen(nullable=False, - start=pandas_min_datetime, - end=pandas_max_datetime), # Vanilla case. + pytest.param(TimestampGen(nullable=False, + start=pandas_min_datetime, + end=pandas_max_datetime), + marks=pytest.mark.skipif(condition=is_not_utc(), + reason="fastparquet interprets timestamps in UTC timezone, regardless " + "of timezone settings")), # Vanilla case. 
pytest.param(TimestampGen(nullable=False, start=datetime(1, 2, 1, tzinfo=timezone.utc), end=pandas_min_datetime),