From d29af846ed7f881d7cedccd07f147bde39218101 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 7 May 2024 17:10:51 -0700 Subject: [PATCH] Rework some python tests of Parquet delta encodings (#15693) test_parquet.py currently takes around 55s to run on an RTXA6000 system. A large portion of that run time is in two tests of the Parquet DELTA_LENGTH_BYTE_ARRAY and DELTA_BYTE_ARRAY encodings. These tests are parameterized with varying row counts to test certain encoding edge cases, but the final two row counts (20,000, 50,000) are unnecessarily large to provide adequate test coverage. This PR reduces the number of row counts (some were redundant) and decreases the maximum row count to 1,000. This drops the execution time to just under 26s on the same system. This PR also corrects an oversight from #15239. DELTA_BYTE_ARRAY encoding should have been added to the tests at that time. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15693 --- python/cudf/cudf/tests/test_parquet.py | 55 +++++++++++++++----------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index f1b90b40991..1e175f5ff0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1311,8 +1311,19 @@ def test_parquet_delta_byte_array(datadir): assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname)) +# values chosen to exercise: +# 1 - header only, no bitpacked values +# 2 - one bitpacked value +# 23 - one partially filled miniblock +# 32 - almost full miniblock +# 33 - one full miniblock +# 34 - one full miniblock plus one value in new miniblock +# 128 - almost full block +# 129 - one full block +# 130 - one full block plus one value in new block +# 1000 - multiple 
blocks def delta_num_rows(): - return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000] + return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000] @pytest.mark.parametrize("nrows", delta_num_rows()) @@ -1412,17 +1423,16 @@ def test_delta_byte_array_roundtrip( pcdf = cudf.from_pandas(test_pdf) assert_eq(cdf, pcdf) - # Test DELTA_LENGTH_BYTE_ARRAY writing as well - if str_encoding == "DELTA_LENGTH_BYTE_ARRAY": - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cdfdeltaba.parquet") + pcdf.to_parquet( + cudf_fname, + compression="snappy", + header_version="2.0", + use_dictionary=False, + ) + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) @pytest.mark.parametrize("nrows", delta_num_rows()) @@ -1479,17 +1489,16 @@ def string_list_gen_wrapped(x, y): pcdf = cudf.from_pandas(test_pdf) assert_eq(cdf, pcdf) - # Test DELTA_LENGTH_BYTE_ARRAY writing as well - if str_encoding == "DELTA_LENGTH_BYTE_ARRAY": - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cdfdeltaba.parquet") + pcdf.to_parquet( + cudf_fname, + compression="snappy", + header_version="2.0", + use_dictionary=False, + ) + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) @pytest.mark.parametrize(