add test on writing empty table to ORC/PARQUET (#4333)
Signed-off-by: sperlingxx <[email protected]>
sperlingxx authored Dec 9, 2021
1 parent c7483f2 commit 59b9db1
Showing 2 changed files with 34 additions and 0 deletions.
12 changes: 12 additions & 0 deletions integration_tests/src/main/python/orc_write_test.py
@@ -152,3 +152,15 @@ def test_buckets_write_fallback(spark_tmp_path, spark_tmp_table_factory):
        data_path,
        'DataWritingCommandExec',
        conf = {'spark.rapids.sql.format.orc.write.enabled': True})

@pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
        return gen_df(spark, gen_list, length=0).write.orc(path)
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.orc(path),
        data_path,
        conf={'spark.rapids.sql.format.orc.write.enabled': True})
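
The new test exercises the zero-row path: gen_df with length=0 yields a DataFrame that has the generated schema but no rows, so the ORC write emits only file metadata, and the round trip must still agree between CPU and GPU. A minimal standalone sketch of the same round trip, assuming only a local SparkSession; the schema and output path here are illustrative and not from the test harness:

    # Sketch of the empty-table write/read round trip, assuming local pyspark;
    # the two-column schema and /tmp path are hypothetical stand-ins for gen_df.
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    schema = StructType([StructField("_c0", IntegerType()),
                         StructField("_c1", StringType())])
    empty_df = spark.createDataFrame([], schema)   # zero rows, fixed schema
    empty_df.write.mode("overwrite").orc("/tmp/ORC_DATA/example")  # metadata only
    assert spark.read.orc("/tmp/ORC_DATA/example").count() == 0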
22 changes: 22 additions & 0 deletions integration_tests/src/main/python/parquet_write_test.py
@@ -389,3 +389,25 @@ def test_it(spark):
            if allow_non_empty or e.desc.find('non-empty directory') == -1:
                raise e
    with_gpu_session(test_it, conf)

@pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn)
@pytest.mark.parametrize('reader_confs', reader_opt_confs)
@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"])
@pytest.mark.parametrize('ts_type', parquet_ts_write_options)
def test_write_empty_parquet_round_trip(spark_tmp_path,
                                        parquet_gens,
                                        v1_enabled_list,
                                        ts_type,
                                        reader_confs):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
        return gen_df(spark, gen_list, length=0).write.parquet(path)
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = copy_and_update(reader_confs, writer_confs, {
        'spark.sql.sources.useV1SourceList': v1_enabled_list,
        'spark.sql.parquet.outputTimestampType': ts_type})
    assert_gpu_and_cpu_writes_are_equal_collect(
        create_empty_df,
        lambda spark, path: spark.read.parquet(path),
        data_path,
        conf=all_confs)
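
The Parquet variant additionally composes three conf sources (the parametrized reader confs, the shared writer confs, and the per-test overrides for the V1 source list and timestamp type) into one dict via copy_and_update before running the round trip. A hedged sketch of an equivalent merge helper, assuming copy_and_update is the harness's shallow left-to-right dict-merging utility:

    # Sketch of a merge helper equivalent to copy_and_update, assuming it
    # copies nothing in place and lets later dicts win on duplicate keys.
    def copy_and_update(*conf_dicts):
        merged = {}
        for confs in conf_dicts:
            merged.update(confs)  # later dicts override earlier ones
        return merged

    # Example: the per-test override takes precedence over a shared default.
    all_confs = copy_and_update(
        {'spark.sql.parquet.outputTimestampType': 'INT96'},
        {'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})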
