Skip to content

Commit

Permalink
Fix skipping fixed_length_char ORC tests on [databricks] > 13.3
Browse files Browse the repository at this point in the history
Fixes NVIDIA#11528.

This commit addresses the failure of
`orc_test.py::test_project_fallback_when_reading_hive_fixed_length_char`
when run on Databricks 13.3+.

This test was skipped with an incorrect reason statement in NVIDIA#9646, to
address a failure on Databricks 13.3.  That failure was not the result
of a data mismatch.  It failed because the ProjectExec didn't fall back
to CPU.  (In fact, with Databricks 13.3 onwards, the ProjectExec is not
involved in `SELECT *` queries.)

As an aside, the same test is now skipped on Databricks 13.3 and 14.3,
because the error condition it tests does not fire on those versions.

Signed-off-by: MithunR <[email protected]>
  • Loading branch information
mythrocks committed Oct 24, 2024
1 parent 5ed0a12 commit b54b266
Showing 1 changed file with 17 additions and 3 deletions.
20 changes: 17 additions & 3 deletions integration_tests/src/main/python/orc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,33 +845,47 @@ def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, kee
assert_gpu_and_cpu_are_equal_collect(
lambda spark: spark.read.orc(data_path), conf=all_confs)

# Reconstructed post-commit version: the stale pre-change skipif decorator
# (which special-cased Databricks 3.4.1) was removed by this commit and is
# replaced by the simpler condition below.
@pytest.mark.skipif(is_spark_340_or_later() and not is_databricks_runtime(),
                    reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
@pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
@pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
def test_read_hive_fixed_length_char(std_input_path, data_file, reader):
    """
    Test that a file containing CHAR data is readable as STRING.
    The plugin behaviour matches all Spark versions prior to 3.4.0,
    and Databricks version 13.3 (i.e. 3.4.1) and after.
    """
    # Verify GPU and CPU produce identical results when reading the
    # Hive-written ORC file containing fixed-length CHAR columns.
    assert_gpu_and_cpu_are_equal_collect(
        reader(std_input_path + '/' + data_file),
        conf={})


# Reconstructed post-commit version: the pre-change skipif (which also
# excluded Databricks 3.4.1) and the old one-line docstring were removed by
# this commit; the diff rendering had left both old and new lines interleaved.
@allow_non_gpu("ProjectExec")
@pytest.mark.skipif(is_before_spark_340(),
                    reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
@pytest.mark.skipif(is_databricks_version_or_later(13, 3),
                    reason="The SELECT * query does not involve ProjectExec "
                           "on Databricks versions >= 13.3. "
                           "Can't test Project fallback without ProjectExec.")
@pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
@pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
def test_project_fallback_when_reading_hive_fixed_length_char(std_input_path, data_file, reader):
    """
    Test that reading a file containing fixed-width CHAR data (e.g. CHAR(3)) as a STRING column
    causes the ProjectExec to fall back to CPU.
    Note: This test can be removed when
    https://github.com/NVIDIA/spark-rapids/issues/8324 is resolved.
    This test does not apply to Databricks >= 13.3, because there would be
    no ProjectExec to fall back to CPU.
    """
    # Expect the query to run, but with ProjectExec executing on the CPU.
    assert_gpu_fallback_collect(
        reader(std_input_path + '/' + data_file),
        cpu_fallback_class_name="ProjectExec",
        conf={})


@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql])
@pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
@pytest.mark.parametrize('orc_impl', ["native", "hive"])
Expand Down

0 comments on commit b54b266

Please sign in to comment.