Rework some python tests of Parquet delta encodings (#15693)
test_parquet.py currently takes around 55s to run on an RTX A6000 system. A large portion of that run time is spent in two tests of the Parquet DELTA_LENGTH_BYTE_ARRAY and DELTA_BYTE_ARRAY encodings. These tests are parameterized with varying row counts to exercise certain encoding edge cases, but the final two row counts (20,000 and 50,000) are larger than necessary to provide adequate coverage. This PR removes some redundant row counts and decreases the maximum row count to 1,000, which drops the execution time to just under 26s on the same system.

This PR also corrects an oversight from #15239: DELTA_BYTE_ARRAY encoding should have been added to these tests at that time.
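For context, the read direction these tests exercise looks roughly like the sketch below. This is illustrative only, not code from the test suite: the file name, column contents, and row count are invented, and it assumes a pyarrow version new enough to write DELTA_BYTE_ARRAY via the column_encoding option.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import cudf

# Write a string column with pyarrow, requesting the delta encoding under
# test; dictionary encoding must be disabled for the request to take effect.
pdf = pd.DataFrame({"s": [f"row{i:04d}" for i in range(130)]})
pq.write_table(
    pa.Table.from_pandas(pdf),
    "delta.parquet",
    use_dictionary=False,
    column_encoding={"s": "DELTA_BYTE_ARRAY"},
    data_page_version="2.0",
)

# cudf must decode the pyarrow-written file to the same values pandas sees.
got = cudf.read_parquet("delta.parquet")
pd.testing.assert_frame_equal(got.to_pandas(), pd.read_parquet("delta.parquet"))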

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15693
etseidl authored May 8, 2024
1 parent 5154661 commit d29af84
Showing 1 changed file with 32 additions and 23 deletions.
55 changes: 32 additions & 23 deletions python/cudf/cudf/tests/test_parquet.py
@@ -1311,8 +1311,19 @@ def test_parquet_delta_byte_array(datadir):
     assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname))
 
 
+# values chosen to exercise:
+# 1 - header only, no bitpacked values
+# 2 - one bitpacked value
+# 23 - one partially filled miniblock
+# 32 - almost full miniblock
+# 33 - one full miniblock
+# 34 - one full miniblock plus one value in new miniblock
+# 128 - almost full block
+# 129 - one full block
+# 130 - one full block plus one value in new block
+# 1000 - multiple blocks
 def delta_num_rows():
-    return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000]
+    return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000]
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
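The miniblock and block boundaries named in the new comment come from DELTA_BINARY_PACKED, which both string delta encodings use for their length data. Assuming the writer configuration the comment implies (128-value blocks split into four 32-value miniblocks, with the first value carried in the page header), a quick sketch of the arithmetic behind the chosen row counts:

BLOCK_SIZE = 128     # values per DELTA_BINARY_PACKED block (assumed writer default)
MINIBLOCK_SIZE = 32  # four miniblocks per 128-value block

def boundary_summary(nrows: int) -> str:
    # The first value travels in the delta header, so only nrows - 1
    # values are bitpacked into miniblocks.
    deltas = max(nrows - 1, 0)
    full_minis, leftover = divmod(deltas, MINIBLOCK_SIZE)
    full_blocks = deltas // BLOCK_SIZE
    return (
        f"{nrows:>4} rows -> {deltas:>3} packed values: "
        f"{full_blocks} full blocks, {full_minis} full miniblocks, {leftover} leftover"
    )

for n in [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000]:
    print(boundary_summary(n))

For example, 33 rows yields exactly 32 packed values (one full miniblock), while 34 rows spills a single value into a new miniblock, matching the comment above.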
@@ -1412,17 +1423,16 @@ def test_delta_byte_array_roundtrip(
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
@@ -1479,17 +1489,16 @@ def string_list_gen_wrapped(x, y):
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize(
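For completeness, the write-back direction that both hunks now run unconditionally looks roughly like this standalone sketch. The file name and data are invented; the keyword arguments are the ones the tests pass, which steer cudf's writer toward the delta encodings.

import pandas as pd

import cudf

gdf = cudf.DataFrame({"s": ["a", "bb", "ccc", "dddd"]})
gdf.to_parquet(
    "cdfdeltaba.parquet",
    compression="snappy",
    header_version="2.0",   # V2 page headers, as the tests request
    use_dictionary=False,   # disable dictionary so a delta encoding is chosen
)

# pyarrow (via pandas) must read the cudf-written file back unchanged.
assert pd.read_parquet("cdfdeltaba.parquet").equals(gdf.to_pandas())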
