add test on writing empty table to ORC/PARQUET (#4333)
Signed-off-by: sperlingxx <[email protected]>
sperlingxx authored Dec 9, 2021
1 parent c7483f2 commit 59b9db1
Showing 2 changed files with 34 additions and 0 deletions.
12 changes: 12 additions & 0 deletions integration_tests/src/main/python/orc_write_test.py
@@ -152,3 +152,15 @@ def test_buckets_write_fallback(spark_tmp_path, spark_tmp_table_factory):
        data_path,
        'DataWritingCommandExec',
        conf = {'spark.rapids.sql.format.orc.write.enabled': True})

@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
        return gen_df(spark, gen_list, length=0).write.orc(path)
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.orc(path),
        data_path,
        conf={'spark.rapids.sql.format.orc.write.enabled': True})
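
The new test exercises the zero-row path: gen_df with length=0 yields a DataFrame that has the generated schema but no rows, so the ORC write emits only file metadata, and the round trip must still agree between CPU and GPU. A minimal standalone sketch of the same round trip, assuming only a local SparkSession; the schema and output path here are illustrative and not from the test harness:

    # Sketch of the empty-table write/read round trip, assuming local pyspark;
    # the two-column schema and /tmp path are hypothetical stand-ins for gen_df.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    schema = StructType([StructField("_c0", IntegerType()),
                         StructField("_c1", StringType())])
    empty_df = spark.createDataFrame([], schema)   # zero rows, fixed schema
    empty_df.write.mode("overwrite").orc("/tmp/ORC_DATA/example")  # metadata only
    assert spark.read.orc("/tmp/ORC_DATA/example").count() == 0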
22 changes: 22 additions & 0 deletions integration_tests/src/main/python/parquet_write_test.py
@@ -389,3 +389,25 @@ def test_it(spark):
            if allow_non_empty or e.desc.find('non-empty directory') == -1:
                raise e
    with_gpu_session(test_it, conf)

@pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn)
@pytest.mark.parametrize('reader_confs', reader_opt_confs)
@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"])
@pytest.mark.parametrize('ts_type', parquet_ts_write_options)
def test_write_empty_parquet_round_trip(spark_tmp_path,
                                        parquet_gens,
                                        v1_enabled_list,
                                        ts_type,
                                        reader_confs):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
        return gen_df(spark, gen_list, length=0).write.parquet(path)
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = copy_and_update(reader_confs, writer_confs, {
        'spark.sql.sources.useV1SourceList': v1_enabled_list,
        'spark.sql.parquet.outputTimestampType': ts_type})
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=all_confs)
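
The Parquet variant additionally composes three conf sources (the parametrized reader confs, the shared writer confs, and the per-test overrides for the V1 source list and timestamp type) into one dict via copy_and_update before running the round trip. A hedged sketch of an equivalent merge helper, assuming copy_and_update is the harness's shallow left-to-right dict-merging utility:

    # Sketch of a merge helper equivalent to copy_and_update, assuming it
    # copies nothing in place and lets later dicts win on duplicate keys.
    def copy_and_update(*conf_dicts):
        merged = {}
        for confs in conf_dicts:
            merged.update(confs)  # later dicts override earlier ones
        return merged

    # Example: the per-test override takes precedence over a shared default.
    all_confs = copy_and_update(
        {'spark.sql.parquet.outputTimestampType': 'INT96'},
        {'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})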
