Skip to content

Commit

Permalink
Skip fastparquet timestamp tests for non-UTC timezones.
Browse files Browse the repository at this point in the history
Fixes NVIDIA#9776.

`fastparquet` seems to read Parquet timestamp columns and interpret them
in the UTC timezone, regardless of timezone settings.
The Spark RAPIDS plugin falls back on Apache Spark (CPU) to interpret
timestamp columns, when it detects that the timezone is non-UTC.
Apache Spark seems to correctly interpret the timestamps based on
the configured timezone.

This causes the `fastparquet` timestamp tests to fail in cases where
the timezone is unspecified.

This commit xfails the timestamp tests when a non-UTC timezone is
detected.

Signed-off-by: MithunR <[email protected]>
  • Loading branch information
mythrocks committed Nov 21, 2023
1 parent 2667941 commit e094eb2
Showing 1 changed file with 19 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ def read_with_fastparquet_or_plugin(spark):
return read_with_fastparquet_or_plugin


def is_timezone_utc():
    """Return True only when both the Spark session and this process use UTC.

    Two conditions must hold:
      * the Spark conf value ``spark.sql.session.timeZone`` equals ``"UTC"``
      * the local timezone name reported by the ``time`` module equals ``"UTC"``

    The comparison is an exact string match, so aliases of UTC are treated
    as non-UTC.
    """
    # Imported lazily inside the function, as in the original code.
    from spark_init_internal import get_spark_i_know_what_i_am_doing
    import time

    # NOTE(review): presumably returns the shared test SparkSession - confirm.
    session = get_spark_i_know_what_i_am_doing()
    session_tz = session.conf.get("spark.sql.session.timeZone")
    if session_tz != "UTC":
        return False

    # time.tzname is (standard name, DST name); time.daylight selects the DST
    # entry whenever a DST zone is defined for the local timezone.
    local_tz_name = time.tzname[time.daylight]
    return local_tz_name == "UTC"


@pytest.mark.skipif(condition=fastparquet_unavailable(),
reason="fastparquet is required for testing fastparquet compatibility")
@pytest.mark.skipif(condition=spark_version() < "3.4.0",
Expand All @@ -119,9 +126,12 @@ def read_with_fastparquet_or_plugin(spark):
marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")),
pytest.param(DateGen(nullable=False),
marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. year=8705) incorrectly.")),
TimestampGen(nullable=False,
start=pandas_min_datetime,
end=pandas_max_datetime), # Vanilla case.
pytest.param(TimestampGen(nullable=False,
start=pandas_min_datetime,
end=pandas_max_datetime),
marks=pytest.mark.xfail(condition=not is_timezone_utc(),
reason="fastparquet interprets timestamps in UTC timezone, regardless "
"of timezone settings")), # Vanilla case.
pytest.param(TimestampGen(nullable=False,
start=pandas_min_datetime,
end=pandas_max_datetime),
Expand Down Expand Up @@ -188,9 +198,12 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
marks=pytest.mark.xfail(reason="fastparquet reads dates as timestamps.")),
pytest.param(DateGen(nullable=False),
marks=pytest.mark.xfail(reason="fastparquet reads far future dates (e.g. year=8705) incorrectly.")),
TimestampGen(nullable=False,
start=pandas_min_datetime,
end=pandas_max_datetime), # Vanilla case.
pytest.param(TimestampGen(nullable=False,
start=pandas_min_datetime,
end=pandas_max_datetime),
marks=pytest.mark.xfail(condition=not is_timezone_utc(),
reason="fastparquet interprets timestamps in UTC timezone, regardless "
"of timezone settings")), # Vanilla case.
pytest.param(TimestampGen(nullable=False,
start=datetime(1, 1, 1, tzinfo=timezone.utc),
end=pandas_min_datetime),
Expand Down

0 comments on commit e094eb2

Please sign in to comment.