From 966771d2e9929713a5a32493b9ad4466a1b8e390 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 7 May 2024 19:13:56 +0000 Subject: [PATCH 1/2] speed up delta encoding tests, fix an omission --- python/cudf/cudf/tests/test_parquet.py | 34 +++++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index f1b90b40991..9fa75fba656 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1311,8 +1311,19 @@ def test_parquet_delta_byte_array(datadir): assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname)) +# values chosen to exercise: +# 1 - header only, no bitpacked values +# 2 - one bitpacked value +# 23 - one partially filled miniblock +# 32 - almost full miniblock +# 33 - one full miniblock +# 34 - one full miniblock plus one value in new miniblock +# 128 - almost full block +# 129 - one full block +# 130 - one full block plus one value in new block +# 1000 - multiple blocks def delta_num_rows(): - return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000] + return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000] @pytest.mark.parametrize("nrows", delta_num_rows()) @@ -1412,17 +1423,16 @@ def test_delta_byte_array_roundtrip( pcdf = cudf.from_pandas(test_pdf) assert_eq(cdf, pcdf) - # Test DELTA_LENGTH_BYTE_ARRAY writing as well - if str_encoding == "DELTA_LENGTH_BYTE_ARRAY": - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cdfdeltaba.parquet") + pcdf.to_parquet( + cudf_fname, + compression="snappy", + header_version="2.0", + use_dictionary=False, + ) + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) @pytest.mark.parametrize("nrows", delta_num_rows()) From a64514d260c0b55356abc9af2efddf301d5d9e99 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 7 May 2024 19:22:32 +0000 Subject: [PATCH 2/2] add more delta_byte_array --- python/cudf/cudf/tests/test_parquet.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9fa75fba656..1e175f5ff0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1489,17 +1489,16 @@ def string_list_gen_wrapped(x, y): pcdf = cudf.from_pandas(test_pdf) assert_eq(cdf, pcdf) - # Test DELTA_LENGTH_BYTE_ARRAY writing as well - if str_encoding == "DELTA_LENGTH_BYTE_ARRAY": - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cdfdeltaba.parquet") + pcdf.to_parquet( + cudf_fname, + compression="snappy", + header_version="2.0", + use_dictionary=False, + ) + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) @pytest.mark.parametrize(