Skip to content

Commit

Permalink
Fix skipping fixed_length_char ORC tests on [databricks] > 13.3
Browse files Browse the repository at this point in the history
Fixes NVIDIA#11528.

This commit addresses the failure of
`orc_test.py::test_project_fallback_when_reading_hive_fixed_length_char`
when run on Databricks 13.3+.

This test was skipped with an incorrect reason statement in NVIDIA#9646, to
address a failure on Databricks 13.3.  That failure was not the result
of a data mismatch.  It failed because the ProjectExec didn't fall back
to CPU.  (In fact, with Databricks 13.3 onwards, the ProjectExec is not
involved in `SELECT *` queries.)

As an aside, the same test is now skipped on Databricks 13.3 and 14.3,
because the error condition it tests does not fire on those versions.

Signed-off-by: MithunR <[email protected]>
  • Loading branch information
mythrocks committed Oct 24, 2024
1 parent 5ed0a12 commit b54b266
Showing 1 changed file with 17 additions and 3 deletions.
20 changes: 17 additions & 3 deletions integration_tests/src/main/python/orc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,33 +845,47 @@ def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, kee
assert_gpu_and_cpu_are_equal_collect(
lambda spark: spark.read.orc(data_path), conf=all_confs)

# Reconstructed post-commit version: the stale pre-change skipif decorator
# (which special-cased Databricks 3.4.1) was removed by this commit and is
# replaced by the simpler condition below.
@pytest.mark.skipif(is_spark_340_or_later() and not is_databricks_runtime(),
                    reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
@pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
@pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
def test_read_hive_fixed_length_char(std_input_path, data_file, reader):
    """
    Test that a file containing CHAR data is readable as STRING.
    The plugin behaviour matches all Spark versions prior to 3.4.0,
    and Databricks version 13.3 (i.e. 3.4.1) and after.
    """
    # Verify GPU and CPU produce identical results when reading the
    # Hive-written ORC file containing fixed-length CHAR columns.
    assert_gpu_and_cpu_are_equal_collect(
        reader(std_input_path + '/' + data_file),
        conf={})


# Reconstructed post-commit version: the pre-change skipif (which also
# excluded Databricks 3.4.1) and the old one-line docstring were removed by
# this commit; the diff rendering had left both old and new lines interleaved.
@allow_non_gpu("ProjectExec")
@pytest.mark.skipif(is_before_spark_340(),
                    reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
@pytest.mark.skipif(is_databricks_version_or_later(13, 3),
                    reason="The SELECT * query does not involve ProjectExec "
                           "on Databricks versions >= 13.3. "
                           "Can't test Project fallback without ProjectExec.")
@pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
@pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
def test_project_fallback_when_reading_hive_fixed_length_char(std_input_path, data_file, reader):
    """
    Test that reading a file containing fixed-width CHAR data (e.g. CHAR(3)) as a STRING column
    causes the ProjectExec to fall back to CPU.
    Note: This test can be removed when
    https://github.com/NVIDIA/spark-rapids/issues/8324 is resolved.
    This test does not apply to Databricks >= 13.3, because there would be
    no ProjectExec to fall back to CPU.
    """
    # Expect the query to run, but with ProjectExec executing on the CPU.
    assert_gpu_fallback_collect(
        reader(std_input_path + '/' + data_file),
        cpu_fallback_class_name="ProjectExec",
        conf={})


@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql])
@pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
@pytest.mark.parametrize('orc_impl', ["native", "hive"])
Expand Down

0 comments on commit b54b266

Please sign in to comment.