From 23fb4276aa8ec823f0a8ce5436cff8056e282cad Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 6 Apr 2022 12:46:50 -0700 Subject: [PATCH 1/4] add check --- cpp/src/io/parquet/reader_impl.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 56eb34bbe2f..4d1f0fd402d 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1179,6 +1179,17 @@ rmm::device_buffer reader::impl::decompress_page_data( codec_stats{parquet::SNAPPY, 0, 0}, codec_stats{parquet::BROTLI, 0, 0}}; + auto is_codec_supported = [&](int8_t codec) { + if (codec == parquet::UNCOMPRESSED) return true; + return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { + return codec == cstats.compression_type; + }) != codecs.end(); + }; + CUDF_EXPECTS( + std::all_of( + chunks.begin(), chunks.end(), [&](auto& chunk) { return is_codec_supported(chunk.codec); }), + "Unsupported compression type"); + for (auto& codec : codecs) { for_each_codec_page(codec.compression_type, [&](size_t page) { auto page_uncomp_size = pages[page].uncompressed_page_size; From d136afbfca5041dedd8fd7ff787fc4ed2669682e Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 6 Apr 2022 12:47:02 -0700 Subject: [PATCH 2/4] add test --- python/cudf/cudf/tests/data/parquet/test.zstd | Bin 0 -> 459 bytes python/cudf/cudf/tests/test_parquet.py | 7 +++++++ 2 files changed, 7 insertions(+) create mode 100644 python/cudf/cudf/tests/data/parquet/test.zstd diff --git a/python/cudf/cudf/tests/data/parquet/test.zstd b/python/cudf/cudf/tests/data/parquet/test.zstd new file mode 100644 index 0000000000000000000000000000000000000000..99b584aa557dc0837d70ac2f7d21a3898af7f279 GIT binary patch literal 459 zcmZWm%SyvQ6uqe$vgk(W3=_yA4760KV-rnZxN#%mse=bgp{cS(5(cs&H~AG0 zE7rwUaI4Otx{==^(x+4;1HeBFR^m@g0lhLS77t5gUlU@I4irw2EY-$cz3Ma@PR8Qk zO!=w~;^^N*HcJ!*oy~Qw(#e||=^P`>QQR$L{yx>RBeOi6_j6g3@lYpGCOh{FImXet zg~aZrhT~ihbV|f{o+Q{ys2^jJ>6-l0l%2(L`LG0WvvWKdrS2}G>nYDyo?GqBhe_o6 t81(Cne$a}s-D<|I0MYloX1%Aq^Qi4NqacnOx)~kWjzd45(l)xZ#$WzKX&3+i literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 58ba77d0b0e..3f64caaa07e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2420,3 +2420,10 @@ def test_parquet_reader_decimal_columns(): expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"]) assert_eq(actual, expected) + + +def test_parquet_reader_unsupported_compression(datadir): + fname = datadir / "test.zstd" + + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) From 031e9dac4c583e59646beb7b86aa323a891c94d1 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 6 Apr 2022 16:22:19 -0700 Subject: [PATCH 3/4] rename test file --- .../data/parquet/{test.zstd => spark_zstd.parquet} | Bin python/cudf/cudf/tests/test_parquet.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename python/cudf/cudf/tests/data/parquet/{test.zstd => spark_zstd.parquet} (100%) diff --git a/python/cudf/cudf/tests/data/parquet/test.zstd b/python/cudf/cudf/tests/data/parquet/spark_zstd.parquet similarity index 100% rename from python/cudf/cudf/tests/data/parquet/test.zstd rename to python/cudf/cudf/tests/data/parquet/spark_zstd.parquet diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3f64caaa07e..727200293f7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2423,7 +2423,7 @@ def test_parquet_reader_decimal_columns(): def test_parquet_reader_unsupported_compression(datadir): - fname = datadir / "test.zstd" + fname = datadir / "spark_zstd.parquet" with pytest.raises(RuntimeError): cudf.read_parquet(fname) From 82b437db223070f73c60afa571fec46c7aa01cdc Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 6 Apr 2022 16:34:22 -0700 Subject: [PATCH 4/4] clean up --- cpp/src/io/parquet/reader_impl.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 4d1f0fd402d..46b3206f731 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1179,16 +1179,18 @@ rmm::device_buffer reader::impl::decompress_page_data( codec_stats{parquet::SNAPPY, 0, 0}, codec_stats{parquet::BROTLI, 0, 0}}; - auto is_codec_supported = [&](int8_t codec) { + auto is_codec_supported = [&codecs](int8_t codec) { if (codec == parquet::UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); }; - CUDF_EXPECTS( - std::all_of( - chunks.begin(), chunks.end(), [&](auto& chunk) { return is_codec_supported(chunk.codec); }), - "Unsupported compression type"); + CUDF_EXPECTS(std::all_of(chunks.begin(), + chunks.end(), + [&is_codec_supported](auto const& chunk) { + return is_codec_supported(chunk.codec); + }), + "Unsupported compression type"); for (auto& codec : codecs) { for_each_codec_page(codec.compression_type, [&](size_t page) {