From d2b9a1c0ded8381a641a0b142069a9826072b963 Mon Sep 17 00:00:00 2001
From: vuule
Date: Wed, 17 Aug 2022 00:27:28 -0700
Subject: [PATCH 01/66] add ZSTD compression to the adapter

---
 cpp/src/io/comp/nvcomp_adapter.cpp | 65 ++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index 91deda50cf2..6a798662cb8 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -21,17 +21,29 @@
 #include
 
+#define NVCOMP_DEFLATE_HEADER
+#if __has_include(NVCOMP_DEFLATE_HEADER)
+#include NVCOMP_DEFLATE_HEADER
+#endif
+
 #define NVCOMP_ZSTD_HEADER
 #if __has_include(NVCOMP_ZSTD_HEADER)
 #include NVCOMP_ZSTD_HEADER
-#define NVCOMP_HAS_ZSTD 1
+#endif
+
+#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3)
+#define NVCOMP_HAS_ZSTD_DECOMP 1
 #else
-#define NVCOMP_HAS_ZSTD 0
+#define NVCOMP_HAS_ZSTD_DECOMP 0
 #endif
 
-#define NVCOMP_DEFLATE_HEADER
-#if __has_include(NVCOMP_DEFLATE_HEADER)
-#include NVCOMP_DEFLATE_HEADER
+#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 4)
+#define NVCOMP_HAS_ZSTD_COMP 1
+#else
+#define NVCOMP_HAS_ZSTD_COMP 0
+#endif
+
+#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3)
 #define NVCOMP_HAS_DEFLATE 1
 #else
 #define NVCOMP_HAS_DEFLATE 0
 #endif
@@ -63,7 +75,7 @@ nvcompStatus_t batched_decompress_get_temp_size_ex(compression_type compression,
     case compression_type::SNAPPY:
       return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward(args)...);
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD
+#if NVCOMP_HAS_ZSTD_DECOMP
       return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...);
 #else
       CUDF_FAIL("Unsupported compression type");
@@ -83,7 +95,7 @@ auto batched_decompress_get_temp_size(compression_type compression, Args&&... ar
     case compression_type::SNAPPY:
       return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...);
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD
+#if NVCOMP_HAS_ZSTD_DECOMP
       return nvcompBatchedZstdDecompressGetTempSize(std::forward(args)...);
 #else
       CUDF_FAIL("Unsupported compression type");
@@ -106,7 +118,7 @@ auto batched_decompress_async(compression_type compression, Args&&... args)
     case compression_type::SNAPPY:
       return nvcompBatchedSnappyDecompressAsync(std::forward(args)...);
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD
+#if NVCOMP_HAS_ZSTD_DECOMP
       return nvcompBatchedZstdDecompressAsync(std::forward(args)...);
 #else
       CUDF_FAIL("Unsupported compression type");
@@ -153,7 +165,7 @@ void batched_decompress(compression_type compression,
 {
   // TODO Consolidate config use to a common location
   if (compression == compression_type::ZSTD) {
-#if NVCOMP_HAS_ZSTD
+#if NVCOMP_HAS_ZSTD_DECOMP
 #if NVCOMP_ZSTD_IS_EXPERIMENTAL
     CUDF_EXPECTS(cudf::io::detail::nvcomp_integration::is_all_enabled(),
                  "Zstandard compression is experimental, you can enable it through "
@@ -210,7 +222,14 @@ auto batched_compress_temp_size(compression_type compression,
 #else
       CUDF_FAIL("Unsupported compression type");
 #endif
-    case compression_type::ZSTD: [[fallthrough]];
+    case compression_type::ZSTD:
+#if NVCOMP_HAS_ZSTD_COMP
+      nvcomp_status = nvcompBatchedZstdCompressGetTempSize(
+        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size);
+      break;
+#else
+      CUDF_FAIL("Unsupported compression type");
+#endif
     default: CUDF_FAIL("Unsupported compression type");
   }
 
@@ -238,7 +257,14 @@ size_t batched_compress_get_max_output_chunk_size(compression_type compression,
 #else
       CUDF_FAIL("Unsupported compression type");
 #endif
-    case compression_type::ZSTD: [[fallthrough]];
+    case compression_type::ZSTD:
+#if NVCOMP_HAS_ZSTD_COMP
+      status = nvcompBatchedZstdCompressGetMaxOutputChunkSize(
+        max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size);
+      break;
+#else
+      CUDF_FAIL("Unsupported compression type");
+#endif
     default: CUDF_FAIL("Unsupported compression type");
   }
 
@@ -289,7 +315,22 @@ static void batched_compress_async(compression_type compression,
 #else
       CUDF_FAIL("Unsupported compression type");
 #endif
-    case compression_type::ZSTD: [[fallthrough]];
+    case compression_type::ZSTD:
+#if NVCOMP_HAS_ZSTD_COMP
+      nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs,
+                                                     device_uncompressed_bytes,
+                                                     max_uncompressed_chunk_bytes,
+                                                     batch_size,
+                                                     device_temp_ptr,
+                                                     temp_bytes,
+                                                     device_compressed_ptrs,
+                                                     device_compressed_bytes,
+                                                     nvcompBatchedZstdDefaultOpts,
+                                                     stream.value());
+      break;
+#else
+      CUDF_FAIL("Unsupported compression type");
+#endif
     default: CUDF_FAIL("Unsupported compression type");
   }
   CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression");
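Note on the gating above: ZSTD decompression needs nvcomp 2.3 or newer (NVCOMP_HAS_ZSTD_DECOMP), while ZSTD compression needs 2.4 or newer (NVCOMP_HAS_ZSTD_COMP). The writer-side patches that follow drive this through the existing cudf::io::nvcomp::batched_compress entry point. A minimal sketch of such a call, with illustrative variable names and the device buffer setup assumed to happen elsewhere:

// Sketch only; comp_in/comp_out are device arrays of per-chunk input/output spans and
// comp_stat receives one status entry per chunk, matching the ORC call site in PATCH 02.
namespace nvcomp = cudf::io::nvcomp;
nvcomp::batched_compress(nvcomp::compression_type::ZSTD,  // new in this patch
                         comp_in,        // spans of uncompressed input chunks
                         comp_out,       // spans of output buffers for compressed chunks
                         comp_stat,      // per-chunk compression results
                         comp_blk_size,  // size of the largest uncompressed chunk
                         stream);        // rmm::cuda_stream_view
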
From fd4c440075fe51b720447b7bb8d7df1555b7c4ba Mon Sep 17 00:00:00 2001
From: vuule
Date: Wed, 17 Aug 2022 00:28:25 -0700
Subject: [PATCH 02/66] C++ changes

---
 cpp/src/io/orc/stripe_enc.cu  | 3 +++
 cpp/src/io/orc/writer_impl.cu | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 5e9a6f8df6b..45fc2f28cee 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -1345,6 +1345,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data,
   } else if (compression == ZLIB and detail::nvcomp_integration::is_all_enabled()) {
     nvcomp::batched_compress(
       nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, comp_blk_size, stream);
+  } else if (compression == ZSTD and detail::nvcomp_integration::is_all_enabled()) {
+    nvcomp::batched_compress(
+      nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stat, comp_blk_size, stream);
   } else if (compression != NONE) {
     CUDF_FAIL("Unsupported compression type");
   }
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 996b5f43b48..1f0c61e5e9f 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -92,6 +92,7 @@ orc::CompressionKind to_orc_compression(compression_type compression)
     case compression_type::AUTO:
     case compression_type::SNAPPY: return orc::CompressionKind::SNAPPY;
     case compression_type::ZLIB: return orc::CompressionKind::ZLIB;
+    case compression_type::ZSTD: return orc::CompressionKind::ZSTD;
     case compression_type::NONE: return orc::CompressionKind::NONE;
     default: CUDF_FAIL("Unsupported compression type"); return orc::CompressionKind::NONE;
   }
@@ -2010,6 +2011,7 @@ auto to_nvcomp_compression_type(CompressionKind compression_kind)
 {
   if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY;
   if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE;
+  if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD;
   CUDF_FAIL("Unsupported compression type");
 }
 
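With the two mappings above, a user-facing compression_type::ZSTD request reaches the nvcomp adapter. A minimal round-trip sketch against the public ORC API, modeled on the temporary test added in PATCH 06 (assumes a libcudf build whose nvcomp provides ZSTD compression and that the nvcomp integration is enabled):

#include <cudf/io/orc.hpp>
#include <cudf/table/table_view.hpp>

#include <vector>

void orc_zstd_round_trip(cudf::table_view const& expected)
{
  std::vector<char> out_buffer;
  cudf::io::orc_writer_options args =
    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected)
      .compression(cudf::io::compression_type::ZSTD);
  cudf::io::write_orc(args);

  cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(
    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
  auto const result = cudf::io::read_orc(read_opts);
  // result.tbl now holds the round-tripped table; the test compares it to `expected`.
}
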
From 61d607ba101cb53f441a616dff98f54ba7c33814 Mon Sep 17 00:00:00 2001
From: vuule
Date: Wed, 17 Aug 2022 00:28:34 -0700
Subject: [PATCH 03/66] Python changes

---
 python/cudf/cudf/_lib/orc.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 11c70317a39..195648e0841 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -161,6 +161,8 @@ cdef compression_type _get_comp_type(object compression):
         return compression_type.SNAPPY
     elif compression == "ZLIB":
         return compression_type.ZLIB
+    elif compression == "ZSTD":
+        return compression_type.ZSTD
     else:
         raise ValueError(f"Unsupported `compression` type {compression}")
 
From 782b43510b6d2a2e43f56219a1948b7d48c0e4aa Mon Sep 17 00:00:00 2001
From: vuule
Date: Wed, 17 Aug 2022 14:13:20 -0700
Subject: [PATCH 04/66] cmake workaround

---
 cpp/cmake/thirdparty/get_nvcomp.cmake | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index 41bbf44abc8..82baaa6b043 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -14,18 +14,12 @@
 # This function finds nvcomp and sets any additional necessary environment variables.
function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( + # WORKAROUND; DO NOT MERGE + rapids_find_package(nvcomp REQUIRED BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} ) - # Per-thread default stream - if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() endfunction() find_and_configure_nvcomp() From 2478bd616f69c2a5cea23d1c4bf8dc6d6d1076ad Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 15:26:31 -0700 Subject: [PATCH 05/66] untested Parquet C++ --- cpp/src/io/parquet/writer_impl.cu | 96 ++++++------------------------- 1 file changed, 18 insertions(+), 78 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 755cec0636c..92bf34b80dc 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -24,6 +24,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include #include #include #include @@ -79,6 +80,7 @@ parquet::Compression to_parquet_compression(compression_type compression) switch (compression) { case compression_type::AUTO: case compression_type::SNAPPY: return parquet::Compression::SNAPPY; + case compression_type::ZSTD: return parquet::Compression::ZSTD; case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } @@ -1139,83 +1141,6 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void snappy_compress(device_span const> comp_in, - device_span const> comp_out, - device_span comp_stats, - size_t max_page_uncomp_data_size, - rmm::cuda_stream_view stream) -{ - size_t num_comp_pages = comp_in.size(); - try { - size_t temp_size; - nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( - num_comp_pages, max_page_uncomp_data_size, nvcompBatchedSnappyDefaultOpts, &temp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Error in getting snappy compression scratch size"); - - // Not needed now but nvcomp API makes no promises about future - rmm::device_buffer scratch(temp_size, stream); - // Analogous to comp_in.srcDevice - rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.srcSize - rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); - // Analogous to comp_in.dstDevice - rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_stat.bytes_written - rmm::device_uvector compressed_bytes_written(num_comp_pages, stream); - // nvcomp does not currently use comp_in.dstSize. 
Cannot assume that the output will fit in - // the space allocated unless one uses the API nvcompBatchedSnappyCompressGetOutputSize() - - // Prepare the vectors - auto comp_it = - thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); - thrust::transform( - rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); - - thrust::transform(rmm::exec_policy(stream), - comp_out.begin(), - comp_out.end(), - compressed_data_ptrs.begin(), - [] __device__(auto const& out) { return out.data(); }); - nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), - uncompressed_data_sizes.data(), - max_page_uncomp_data_size, - num_comp_pages, - scratch.data(), // Not needed rn but future - scratch.size(), - compressed_data_ptrs.data(), - compressed_bytes_written.data(), - nvcompBatchedSnappyDefaultOpts, - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in snappy compression"); - - // nvcomp also doesn't use comp_out.status . It guarantees that given enough output space, - // compression will succeed. - // The other `comp_out` field is `reserved` which is for internal cuIO debugging and can be 0. - thrust::transform(rmm::exec_policy(stream), - compressed_bytes_written.begin(), - compressed_bytes_written.end(), - comp_stats.begin(), - [] __device__(size_t size) { - decompress_status status{}; - status.bytes_written = size; - return status; - }); - return; - } catch (...) { - // If we reach this then there was an error in compressing so set an error status for each page - thrust::for_each(rmm::exec_policy(stream), - comp_stats.begin(), - comp_stats.end(), - [] __device__(decompress_status & stat) { stat.status = 1; }); - }; -} - void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, size_t max_page_uncomp_data_size, @@ -1245,11 +1170,26 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - snappy_compress(comp_in, comp_out, comp_stats, max_page_uncomp_data_size, stream); + nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, + comp_in, + comp_out, + comp_stats, + max_page_uncomp_data_size, + stream); } else { gpu_snap(comp_in, comp_out, comp_stats, stream); } break; + case parquet::Compression::ZSTD: + if (nvcomp_integration::is_all_enabled()) { + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, + comp_in, + comp_out, + comp_stats, + max_page_uncomp_data_size, + stream); + } + break; default: break; } // TBD: Not clear if the official spec actually allows dynamically turning off compression at the From 6291190f8fd01ea09b9472a1345bef7e7902fd18 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 15:26:38 -0700 Subject: [PATCH 06/66] tmp tests --- cpp/tests/io/orc_test.cpp | 11 +++++------ cpp/tests/io/parquet_test.cpp | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 76ffc92e243..cab5066d93d 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1521,16 +1521,15 @@ TEST_F(OrcReaderTest, EmptyColumnsParam) std::vector out_buffer; cudf_io::orc_writer_options args = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected) + 
.compression(cudf_io::compression_type::ZSTD); cudf_io::write_orc(args); - cudf_io::orc_reader_options read_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()}) - .columns({}); + cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder( + cudf_io::source_info{out_buffer.data(), out_buffer.size()}); auto const result = cudf_io::read_orc(read_opts); - EXPECT_EQ(result.tbl->num_columns(), 0); - EXPECT_EQ(result.tbl->num_rows(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index c218c4088bb..79b647a2106 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4204,4 +4204,22 @@ TEST_F(ParquetReaderTest, StructByteArray) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetReaderTest, ZSTD) +{ + srand(31337); + auto const expected = create_random_fixed_table(2, 4, false); + + std::vector out_buffer; + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected) + .compression(cudf_io::compression_type::ZSTD); + cudf_io::write_parquet(args); + + cudf_io::parquet_reader_options read_opts = cudf_io::parquet_reader_options::builder( + cudf_io::source_info{out_buffer.data(), out_buffer.size()}); + auto const result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() From 4641c11c5db5a861b09e2c28d4b185b2afd574ef Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 17:03:50 -0700 Subject: [PATCH 07/66] Parquet Python --- python/cudf/cudf/_lib/parquet.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c87f58bb16c..10cd47aef91 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -668,6 +668,8 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression): return cudf_io_types.compression_type.NONE elif compression == "snappy": return cudf_io_types.compression_type.SNAPPY + elif compression == "ZSTD": + return cudf_io_types.compression_type.ZSTD else: raise ValueError("Unsupported `compression` type") From 70c30f641f12e0b9aea9b5f92fa0e19818a8893a Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 17:20:21 -0700 Subject: [PATCH 08/66] revert temp tests --- cpp/tests/io/orc_test.cpp | 11 ++++++----- cpp/tests/io/parquet_test.cpp | 18 ------------------ 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cab5066d93d..76ffc92e243 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1521,15 +1521,16 @@ TEST_F(OrcReaderTest, EmptyColumnsParam) std::vector out_buffer; cudf_io::orc_writer_options args = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected) - .compression(cudf_io::compression_type::ZSTD); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected); cudf_io::write_orc(args); - cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder( - cudf_io::source_info{out_buffer.data(), out_buffer.size()}); + cudf_io::orc_reader_options read_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()}) + .columns({}); auto const result = 
cudf_io::read_orc(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + EXPECT_EQ(result.tbl->num_columns(), 0); + EXPECT_EQ(result.tbl->num_rows(), 0); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 79b647a2106..c218c4088bb 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4204,22 +4204,4 @@ TEST_F(ParquetReaderTest, StructByteArray) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } -TEST_F(ParquetReaderTest, ZSTD) -{ - srand(31337); - auto const expected = create_random_fixed_table(2, 4, false); - - std::vector out_buffer; - cudf_io::parquet_writer_options args = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected) - .compression(cudf_io::compression_type::ZSTD); - cudf_io::write_parquet(args); - - cudf_io::parquet_reader_options read_opts = cudf_io::parquet_reader_options::builder( - cudf_io::source_info{out_buffer.data(), out_buffer.size()}); - auto const result = cudf_io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); -} - CUDF_TEST_PROGRAM_MAIN() From 198f169273d49d5336fc9747aa3f273795590378 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 17:24:27 -0700 Subject: [PATCH 09/66] py tests --- cpp/src/io/parquet/writer_impl.cu | 2 +- python/cudf/cudf/tests/test_orc.py | 13 +++++-------- python/cudf/cudf/tests/test_parquet.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 92bf34b80dc..ee26ffb0de1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1180,7 +1180,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks gpu_snap(comp_in, comp_out, comp_stats, stream); } break; - case parquet::Compression::ZSTD: + case parquet::Compression::ZSTD: if (nvcomp_integration::is_all_enabled()) { nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 4373ef9afdf..bc7662f29e3 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1745,19 +1745,16 @@ def test_writer_protobuf_large_rowindexentry(): assert_frame_equal(df, got) -def test_orc_writer_zlib_compression(list_struct_buff): +@pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) +def test_orc_writer_nvcomp(list_struct_buff, compression): expected = cudf.read_orc(list_struct_buff) try: - # save with ZLIB compression buff = BytesIO() - expected.to_orc(buff, compression="ZLIB") + expected.to_orc(buff, compression=compression) got = cudf.read_orc(buff) assert_eq(expected, got) - except RuntimeError as e: - if "Unsupported compression type" in str(e): - pytest.mark.xfail(reason="nvcomp build doesn't have deflate") - else: - raise e + except RuntimeError: + pytest.mark.xfail(reason="Newer nvCOMP version is required") @pytest.mark.parametrize("index", [True, False, None]) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 44b13823346..4fb0a5d5552 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2518,3 +2518,14 @@ def test_parquet_columns_and_index_param(index, columns): got = cudf.read_parquet(buffer, columns=columns) assert_eq(expected, got, check_index_type=True) + + +def test_parquet_writer_nvcomp(list_struct_buff, compression): 
+ expected = cudf.read_parquet(datadir / "spark_zstd.parquet") + try: + buff = BytesIO() + expected.to_parquet(buff, compression="ZSTD") + got = cudf.read_parquet(buff) + assert_eq(expected, got) + except RuntimeError: + pytest.mark.xfail(reason="Newer nvCOMP version is required") From 0dbf2a07d132b94f05caf8c9e5a6991581248788 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 17 Aug 2022 17:26:00 -0700 Subject: [PATCH 10/66] style :D --- cpp/cmake/thirdparty/get_nvcomp.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 82baaa6b043..f04720c4835 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -15,7 +15,8 @@ # This function finds nvcomp and sets any additional necessary environment variables. function(find_and_configure_nvcomp) # WORKAROUND; DO NOT MERGE - rapids_find_package(nvcomp REQUIRED + rapids_find_package( + nvcomp REQUIRED BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports ) From f000bb8703eaac95c7634261899ae746880fb7ce Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 18 Aug 2022 12:00:58 -0700 Subject: [PATCH 11/66] update java compression types --- .../java/ai/rapids/cudf/CompressionType.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionType.java b/java/src/main/java/ai/rapids/cudf/CompressionType.java index 48f980d7f71..d722d6d3adb 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionType.java @@ -44,11 +44,21 @@ public enum CompressionType { ZIP(6), /** XZ format using LZMA(2) algorithm */ - XZ(7); + XZ(7), + + /** ZLIB format, using DEFLATE algorithm */ + ZLIB(8), + + /** LZ4 format, using LZ77 */ + LZ4(9), + + /** Lempel–Ziv–Oberhumer format */ + LZO(10), + + /** Zstandard format */ + ZSTD(11); final int nativeId; - CompressionType(int nativeId) { - this.nativeId = nativeId; - } + CompressionType(int nativeId) { this.nativeId = nativeId; } } From a5f81ed72bc9231f125a82d1a701440be14bc297 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 18 Aug 2022 14:18:16 -0700 Subject: [PATCH 12/66] compression block limit in ORC --- cpp/src/io/orc/writer_impl.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1f0c61e5e9f..a24164b06ef 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -108,6 +108,7 @@ constexpr size_t compression_block_size(orc::CompressionKind compression) switch (compression) { case orc::CompressionKind::NONE: return 0; case orc::CompressionKind::ZLIB: return 64 * 1024; + case orc::CompressionKind::ZSTD: return 64 * 1024; default: return 256 * 1024; } } From 377864f34236e73746332f8251d89dfab6da81f1 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 18 Aug 2022 14:19:14 -0700 Subject: [PATCH 13/66] ORC compression check --- cpp/src/io/orc/stripe_enc.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 45fc2f28cee..25985dfb646 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -1352,6 +1353,12 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CUDF_FAIL("Unsupported compression type"); } + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_stat.begin(), + comp_stat.end(), + [] __device__(auto 
const& stat) { return stat.status == 0; }), + "Error during decompression"); + dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); From 78293d9ab744ea65f73c9d6b5992d1692bca8a5f Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 18 Aug 2022 14:22:30 -0700 Subject: [PATCH 14/66] Parquet page limit, compression check, correct output size + HACK --- cpp/src/io/parquet/writer_impl.cu | 55 ++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ee26ffb0de1..a882653dd4c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -44,12 +44,11 @@ #include #include -#include - #include #include #include #include +#include #include #include @@ -940,11 +939,27 @@ void writer::impl::gather_fragment_statistics( stream.synchronize(); } +auto to_nvcomp_compression_type(Compression codec) +{ + if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; + if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; + CUDF_FAIL("Unsupported compression type"); +} + +size_t get_compress_max_output_chunk_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return batched_compress_get_max_output_chunk_size(to_nvcomp_compression_type(codec), + compression_blocksize); +} + auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + Compression compression_codec, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return hostdevice_vector{}; } @@ -989,12 +1004,13 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [](auto page_size) { - size_t page_comp_max_size = 0; - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - page_size, nvcompBatchedSnappyDefaultOpts, &page_comp_max_size); - return page_comp_max_size; - }); + std::transform(page_sizes.begin(), + page_sizes.end(), + comp_page_sizes.begin(), + [compression_codec](auto page_size) { + return get_compress_max_output_chunk_size(compression_codec, page_size) + + 16; // DO NOT MERGE + }); comp_page_sizes.host_to_device(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -1141,6 +1157,15 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } +void compress_check(device_span stats, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + stats.begin(), + stats.end(), + [] __device__(auto const& stat) { return stat.status == 0; }), + "Error during decompression"); +} + void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, size_t max_page_uncomp_data_size, @@ -1190,8 +1215,10 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks stream); } break; - default: break; + default: CUDF_FAIL("invalid compression type"); } + compress_check(comp_stats, stream); + // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); @@ -1232,6 +1259,9 @@ writer::impl::impl(std::vector> sinks, single_write_mode(mode == 
SingleWriteMode::YES), out_sink_(std::move(sinks)) { + if (options.get_compression() == compression_type::ZSTD) { + max_page_size_bytes = std::min(max_page_size_bytes, 64 * 1024ul); + } if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); } @@ -1256,6 +1286,9 @@ writer::impl::impl(std::vector> sinks, single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sinks)) { + if (options.get_compression() == compression_type::ZSTD) { + max_page_size_bytes = std::min(max_page_size_bytes, 64 * 1024ul); + } if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); } @@ -1499,8 +1532,8 @@ void writer::impl::write(table_view const& table, std::vector co } // Build chunk dictionaries and count pages - hostdevice_vector comp_page_sizes = - init_page_sizes(chunks, col_desc, num_columns, max_page_size_bytes, max_page_size_rows, stream); + hostdevice_vector comp_page_sizes = init_page_sizes( + chunks, col_desc, num_columns, max_page_size_bytes, max_page_size_rows, compression_, stream); // Get the maximum page size across all chunks size_type max_page_uncomp_data_size = From 89a0b0b5e61f1240abe6e2fb0274d40eb7e3a2f6 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 18 Aug 2022 14:23:08 -0700 Subject: [PATCH 15/66] fixed Pq test --- python/cudf/cudf/tests/test_parquet.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 4fb0a5d5552..ce62474d739 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2520,12 +2520,20 @@ def test_parquet_columns_and_index_param(index, columns): assert_eq(expected, got, check_index_type=True) -def test_parquet_writer_nvcomp(list_struct_buff, compression): - expected = cudf.read_parquet(datadir / "spark_zstd.parquet") - try: - buff = BytesIO() - expected.to_parquet(buff, compression="ZSTD") - got = cudf.read_parquet(buff) - assert_eq(expected, got) - except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") +def test_parquet_writer_zstd(tmpdir): + num_rows = 25000 + list_size = 7 + data = [ + struct_gen([string_gen, int_gen, string_gen], 0, list_size, False) + for i in range(num_rows) + ] + tmp = pa.Table.from_pydict({"los": data}) + fname = tmpdir.join("zstd.parquet") + pa.parquet.write_table(tmp, fname) + assert os.path.exists(fname) + expected = cudf.read_parquet(fname) + + buff = BytesIO() + expected.to_parquet(buff, compression="ZSTD") + got = cudf.read_parquet(buff) + assert_eq(expected, got) \ No newline at end of file From d9b2dcea6c048da942b8657888833a2756f67150 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 19 Aug 2022 12:12:26 -0700 Subject: [PATCH 16/66] fix Parquet w/o compression; check scratch buffer alignment --- cpp/src/io/comp/nvcomp_adapter.cpp | 7 +++++++ cpp/src/io/parquet/writer_impl.cu | 1 + 2 files changed, 8 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 6a798662cb8..15a1bd04689 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -336,6 +336,12 @@ static void batched_compress_async(compression_type compression, CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression"); } +inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept +{ + auto iptr = reinterpret_cast(ptr); + return !(iptr % alignment); +} + void batched_compress(compression_type 
compression, device_span const> inputs, device_span const> outputs, @@ -347,6 +353,7 @@ void batched_compress(compression_type compression, auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); + CUDF_EXPECTS(is_aligned(scratch.data(), 8), "misaligned scratch"); rmm::device_uvector actual_compressed_data_sizes(num_chunks, stream); auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a882653dd4c..4061feb4cf8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1215,6 +1215,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks stream); } break; + case parquet::Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } compress_check(comp_stats, stream); From bc75ce7360ee987e20d16b82aa2357d50d3dca89 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 19 Aug 2022 13:21:15 -0700 Subject: [PATCH 17/66] comp_in padding --- cpp/src/io/parquet/page_enc.cu | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 8181c76c065..4c75c2b640b 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -211,6 +211,8 @@ __global__ void __launch_bounds__(128) if (frag_id < num_fragments_per_column and lane_id == 0) groups[column_id][frag_id] = *g; } +constexpr size_t nvcomp_pad(size_t size) { return (size + 3) & ~3; } + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -278,7 +280,7 @@ __global__ void __launch_bounds__(128) page_g.num_rows = ck_g.num_dict_entries; page_g.num_leaf_values = ck_g.num_dict_entries; page_g.num_values = ck_g.num_dict_entries; // TODO: shouldn't matter for dict page - page_offset += page_g.max_hdr_size + page_g.max_data_size; + page_offset += nvcomp_pad(page_g.max_hdr_size + page_g.max_data_size); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page]; } @@ -354,7 +356,9 @@ __global__ void __launch_bounds__(128) } page_g.max_hdr_size += stats_hdr_len; } - page_g.page_data = ck_g.uncompressed_bfr + page_offset; + // pad max_hdr_size + page_g.max_hdr_size = nvcomp_pad(page_g.max_hdr_size); + page_g.page_data = ck_g.uncompressed_bfr + page_offset; if (not comp_page_sizes.empty()) { page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; } @@ -378,7 +382,7 @@ __global__ void __launch_bounds__(128) pagestats_g.start_chunk = ck_g.first_fragment + page_start; pagestats_g.num_chunks = page_g.num_fragments; - page_offset += page_g.max_hdr_size + page_g.max_data_size; + page_offset += nvcomp_pad(page_g.max_hdr_size + page_g.max_data_size); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page + num_pages]; } @@ -416,7 +420,7 @@ __global__ void __launch_bounds__(128) __syncwarp(); if (!t) { if (ck_g.ck_stat_size == 0 && ck_g.stats) { - uint32_t ck_stat_size = 48 + 2 * ck_max_stats_len; + uint32_t ck_stat_size = nvcomp_pad(48 + 2 * ck_max_stats_len); page_offset += ck_stat_size; comp_page_offset += ck_stat_size; ck_g.ck_stat_size = ck_stat_size; @@ -1115,7 +1119,8 @@ __global__ void __launch_bounds__(128, 8) uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; if (not 
comp_in.empty()) { - comp_in[blockIdx.x] = {base, actual_data_size}; + comp_in[blockIdx.x] = {base, actual_data_size}; + printf("%lX ", (long)base); comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; } pages[blockIdx.x] = s->page; From e5c9e1a3aa6e0a1b9981aa1e4ad89e2533682021 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 19 Aug 2022 13:22:22 -0700 Subject: [PATCH 18/66] remove printf --- cpp/src/io/parquet/page_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 4c75c2b640b..322a620407d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1120,7 +1120,7 @@ __global__ void __launch_bounds__(128, 8) s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { comp_in[blockIdx.x] = {base, actual_data_size}; - printf("%lX ", (long)base); + // printf("%lX ", (long)base); comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; } pages[blockIdx.x] = s->page; From d56e6b3afd56b786f6825e0673ba8868940b359a Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 19 Aug 2022 14:06:43 -0700 Subject: [PATCH 19/66] style --- python/cudf/cudf/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ce62474d739..85bf2eb207d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2536,4 +2536,4 @@ def test_parquet_writer_zstd(tmpdir): buff = BytesIO() expected.to_parquet(buff, compression="ZSTD") got = cudf.read_parquet(buff) - assert_eq(expected, got) \ No newline at end of file + assert_eq(expected, got) From d059f58758266496ce778b6ea128791ac9f9246b Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 19 Aug 2022 14:13:29 -0700 Subject: [PATCH 20/66] reduce scope of comp_in/out in ORC writer --- cpp/src/io/orc/orc_gpu.hpp | 4 ---- cpp/src/io/orc/stripe_enc.cu | 7 ++++--- cpp/src/io/orc/writer_impl.cu | 4 ---- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 9de7dfffc0c..7f970afda1e 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -350,8 +350,6 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_in Per-block compression input buffers - * @param[out] comp_out Per-block compression output buffers * @param[out] comp_stat Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ @@ -362,8 +360,6 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span> comp_in, - device_span> comp_out, device_span comp_stat, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 25985dfb646..e2a7c8c26a6 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1310,11 +1310,12 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span> comp_in, - device_span> comp_out, device_span comp_stat, rmm::cuda_stream_view 
stream) { + rmm::device_uvector> comp_in(num_compressed_blocks, stream); + rmm::device_uvector> comp_out(num_compressed_blocks, stream); + dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); gpuInitCompressionBlocks<<>>(strm_desc, @@ -1357,7 +1358,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, comp_stat.begin(), comp_stat.end(), [] __device__(auto const& stat) { return stat.status == 0; }), - "Error during decompression"); + "Error during compression"); dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a24164b06ef..677e44d936d 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2179,8 +2179,6 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector> comp_in(num_compressed_blocks, stream); - hostdevice_vector> comp_out(num_compressed_blocks, stream); hostdevice_vector comp_stats(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); @@ -2191,8 +2189,6 @@ void writer::impl::write(table_view const& table) max_compressed_block_size, strm_descs, enc_data.streams, - comp_in, - comp_out, comp_stats, stream); strm_descs.device_to_host(stream); From 9629fe9fb4e2702ab83f1c18c7e9a70455ed0061 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 22 Aug 2022 12:31:22 -0700 Subject: [PATCH 21/66] nvcomp input dump --- cpp/src/io/comp/nvcomp_adapter.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 15a1bd04689..6608862a8f6 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -21,6 +21,8 @@ #include +#include + #define NVCOMP_DEFLATE_HEADER #if __has_include(NVCOMP_DEFLATE_HEADER) #include NVCOMP_DEFLATE_HEADER @@ -369,6 +371,23 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); + if (detail::getenv_or("DUMP_NVCOMP_INPUT", 0)) { + std::vector> h_inputs(num_chunks); + cudaMemcpy(h_inputs.data(), + inputs.data(), + sizeof(device_span) * num_chunks, + cudaMemcpyDeviceToHost); + stream.synchronize(); + int idx = 0; + for (auto& input : h_inputs) { + std::vector h_input(input.size()); + cudaMemcpy( + h_inputs.data(), inputs.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); + std::ofstream myFile("comp_in" + std::to_string(idx++), std::ios::out | std::ios::binary); + myFile.write(reinterpret_cast(h_input.data()), h_input.size()); + } + } + convert_status(std::nullopt, actual_compressed_data_sizes, statuses, stream); } From 1fcb732318f0018b9ca9ca92dc3c6028deeca72d Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 22 Aug 2022 13:47:31 -0700 Subject: [PATCH 22/66] fix dump --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 6608862a8f6..9e59fa46bed 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -382,7 +382,7 @@ void batched_compress(compression_type compression, for (auto& input : h_inputs) { std::vector h_input(input.size()); cudaMemcpy( - h_inputs.data(), inputs.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); + h_input.data(), inputs.data(), sizeof(uint8_t) * input.size(), 
cudaMemcpyDeviceToHost); std::ofstream myFile("comp_in" + std::to_string(idx++), std::ios::out | std::ios::binary); myFile.write(reinterpret_cast(h_input.data()), h_input.size()); } From f75f1071b69c30d120f620977a01b5166b5cf6c3 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Aug 2022 11:39:49 -0700 Subject: [PATCH 23/66] remove output chunk size workaround --- cpp/src/io/parquet/writer_impl.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5dc39085dee..3dc3936ccb5 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -979,8 +979,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, page_sizes.end(), comp_page_sizes.begin(), [compression_codec](auto page_size) { - return get_compress_max_output_chunk_size(compression_codec, page_size) + - 16; // DO NOT MERGE + return get_compress_max_output_chunk_size(compression_codec, page_size); }); comp_page_sizes.host_to_device(stream); From 3c4ad50ce5562482cd211d938ca41caab1951e41 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Aug 2022 13:29:59 -0700 Subject: [PATCH 24/66] fix dump some more --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 9e59fa46bed..0904be6a373 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -382,7 +382,7 @@ void batched_compress(compression_type compression, for (auto& input : h_inputs) { std::vector h_input(input.size()); cudaMemcpy( - h_input.data(), inputs.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); + h_input.data(), input.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); std::ofstream myFile("comp_in" + std::to_string(idx++), std::ios::out | std::ios::binary); myFile.write(reinterpret_cast(h_input.data()), h_input.size()); } From a550ee2368e819c5fdce2daacf8992ffe51f41e2 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 23 Aug 2022 13:48:55 -0700 Subject: [PATCH 25/66] improve dump naming --- cpp/src/io/comp/nvcomp_adapter.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 0904be6a373..93229cecc6b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -351,6 +351,7 @@ void batched_compress(compression_type compression, uint32_t max_uncomp_chunk_size, rmm::cuda_stream_view stream) { + static int batch_idx = 0; auto const num_chunks = inputs.size(); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); @@ -383,11 +384,12 @@ void batched_compress(compression_type compression, std::vector h_input(input.size()); cudaMemcpy( h_input.data(), input.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); - std::ofstream myFile("comp_in" + std::to_string(idx++), std::ios::out | std::ios::binary); + std::ofstream myFile("comp_in_" + std::to_string(batch_idx) + "_" + std::to_string(idx++), + std::ios::out | std::ios::binary); myFile.write(reinterpret_cast(h_input.data()), h_input.size()); } } - + ++batch_idx; convert_status(std::nullopt, actual_compressed_data_sizes, statuses, stream); } From 56af125d031a56f57dabeaeea4d2a87d57043395 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Aug 2022 17:04:12 -0700 Subject: [PATCH 26/66] nvcomp output dump --- cpp/src/io/comp/nvcomp_adapter.cpp | 28 
+++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 93229cecc6b..554ace809f0 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -372,13 +372,13 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); + stream.synchronize(); if (detail::getenv_or("DUMP_NVCOMP_INPUT", 0)) { std::vector> h_inputs(num_chunks); cudaMemcpy(h_inputs.data(), inputs.data(), sizeof(device_span) * num_chunks, cudaMemcpyDeviceToHost); - stream.synchronize(); int idx = 0; for (auto& input : h_inputs) { std::vector h_input(input.size()); @@ -389,6 +389,32 @@ void batched_compress(compression_type compression, myFile.write(reinterpret_cast(h_input.data()), h_input.size()); } } + + if (detail::getenv_or("DUMP_NVCOMP_OUTPUT", 0)) { + std::vector> h_outputs(num_chunks); + cudaMemcpy(h_outputs.data(), + outputs.data(), + sizeof(device_span) * num_chunks, + cudaMemcpyDeviceToHost); + + std::vector actual_sizes(num_chunks); + cudaMemcpy(actual_sizes.data(), + actual_compressed_data_sizes.data(), + sizeof(size_t) * num_chunks, + cudaMemcpyDeviceToHost); + + int idx = 0; + for (auto i = 0u; i < num_chunks; ++i) { + std::vector h_output(actual_sizes[i]); + cudaMemcpy(h_output.data(), + h_outputs[i].data(), + sizeof(uint8_t) * actual_sizes[i], + cudaMemcpyDeviceToHost); + std::ofstream myFile("comp_out_" + std::to_string(batch_idx) + "_" + std::to_string(idx++), + std::ios::out | std::ios::binary); + myFile.write(reinterpret_cast(h_output.data()), actual_sizes[i]); + } + } ++batch_idx; convert_status(std::nullopt, actual_compressed_data_sizes, statuses, stream); } From 48a4f1d1564a44d4fe9213b01c504c36f6a77a53 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Aug 2022 17:04:31 -0700 Subject: [PATCH 27/66] clean up --- cpp/src/io/parquet/page_enc.cu | 12 +++++------- cpp/src/io/parquet/parquet_gpu.hpp | 9 --------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index c2e3d60219d..a96fa1a45d2 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1217,14 +1217,12 @@ __global__ void __launch_bounds__(128, 8) } } if (t == 0) { - uint8_t* base = s->page.page_data + s->page.max_hdr_size; - auto actual_data_size = static_cast(s->cur - base); - uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); - s->page.max_data_size = actual_data_size; + uint8_t* base = s->page.page_data + s->page.max_hdr_size; + auto actual_data_size = static_cast(s->cur - base); + s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base, actual_data_size}; - // printf("%lX ", (long)base); - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; + comp_in[blockIdx.x] = {base, actual_data_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // unused } pages[blockIdx.x] = s->page; if (not comp_stats.empty()) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 823cb8fcc2b..6d4e116c711 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -309,15 +309,6 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -/** - * @brief Return worst-case compressed size of compressed data given the uncompressed size - */ -inline size_t 
__device__ __host__ GetMaxCompressedBfrSize(size_t uncomp_size, - uint32_t num_pages = 1) -{ - return uncomp_size + (uncomp_size >> 7) + num_pages * 8; -} - struct EncPage; /** From 3ccc6b940bc46ed0a82835b4fbeb81af06c0e8b5 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Aug 2022 18:35:54 -0700 Subject: [PATCH 28/66] skip pages too large for ZSTD --- cpp/src/io/comp/nvcomp_adapter.cpp | 11 +++-- cpp/src/io/comp/nvcomp_adapter.cu | 72 +++++++++++++++++++++--------- cpp/src/io/comp/nvcomp_adapter.cuh | 19 +++++++- cpp/src/io/comp/nvcomp_adapter.hpp | 2 - cpp/src/io/orc/stripe_enc.cu | 7 ++- cpp/src/io/parquet/writer_impl.cu | 18 +++----- 6 files changed, 83 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 554ace809f0..cb383d70766 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -348,18 +348,20 @@ void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, device_span statuses, - uint32_t max_uncomp_chunk_size, rmm::cuda_stream_view stream) { static int batch_idx = 0; auto const num_chunks = inputs.size(); + auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); + + auto const max_uncomp_chunk_size = filter_inputs(nvcomp_args.input_data_sizes, statuses, stream); + auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); CUDF_EXPECTS(is_aligned(scratch.data(), 8), "misaligned scratch"); rmm::device_uvector actual_compressed_data_sizes(num_chunks, stream); - auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); batched_compress_async(compression, nvcomp_args.input_data_ptrs.data(), @@ -372,8 +374,8 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); - stream.synchronize(); if (detail::getenv_or("DUMP_NVCOMP_INPUT", 0)) { + stream.synchronize(); std::vector> h_inputs(num_chunks); cudaMemcpy(h_inputs.data(), inputs.data(), @@ -391,6 +393,7 @@ void batched_compress(compression_type compression, } if (detail::getenv_or("DUMP_NVCOMP_OUTPUT", 0)) { + stream.synchronize(); std::vector> h_outputs(num_chunks); cudaMemcpy(h_outputs.data(), outputs.data(), @@ -416,7 +419,7 @@ void batched_compress(compression_type compression, } } ++batch_idx; - convert_status(std::nullopt, actual_compressed_data_sizes, statuses, stream); + convert_status(actual_compressed_data_sizes, statuses, stream); } } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 30551dc31cf..384f6b90b4c 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -57,31 +57,59 @@ batched_args create_batched_nvcomp_args(device_span c std::move(output_data_sizes)}; } -void convert_status(std::optional> nvcomp_stats, +void convert_status(device_span nvcomp_stats, device_span actual_uncompressed_sizes, device_span cudf_stats, rmm::cuda_stream_view stream) { - if (nvcomp_stats.has_value()) { - thrust::transform( - rmm::exec_policy(stream), - nvcomp_stats->begin(), - nvcomp_stats->end(), - actual_uncompressed_sizes.begin(), - cudf_stats.begin(), - [] __device__(auto const& status, auto const& size) { - return decompress_status{size, status == nvcompStatus_t::nvcompSuccess ? 
0u : 1u}; - }); - } else { - thrust::transform(rmm::exec_policy(stream), - actual_uncompressed_sizes.begin(), - actual_uncompressed_sizes.end(), - cudf_stats.begin(), - [] __device__(size_t size) { - decompress_status status{}; - status.bytes_written = size; - return status; - }); - } + thrust::transform_if( + rmm::exec_policy(stream), + nvcomp_stats.begin(), + nvcomp_stats.end(), + actual_uncompressed_sizes.begin(), + cudf_stats.begin(), + cudf_stats.begin(), + [] __device__(auto const& nvcomp_status, auto const& size) { + return decompress_status{size, nvcomp_status == nvcompStatus_t::nvcompSuccess ? 0u : 1u}; + }, + [] __device__(auto const& cudf_status) { return cudf_status.status != 2; }); +} + +void convert_status(device_span actual_uncompressed_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream) +{ + thrust::transform_if( + rmm::exec_policy(stream), + actual_uncompressed_sizes.begin(), + actual_uncompressed_sizes.end(), + cudf_stats.begin(), + cudf_stats.begin(), + [] __device__(auto const& size) { return decompress_status{size}; }, + [] __device__(auto const& cudf_status) { return cudf_status.status != 2; }); } + +size_t filter_inputs(device_span input_sizes, + device_span statuses, + rmm::cuda_stream_view stream) +{ + auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), statuses.begin()); + thrust::transform_if( + rmm::exec_policy(stream), + statuses.begin(), + statuses.end(), + input_sizes.begin(), + status_size_it, + [] __device__(auto const& status) { + return thrust::pair{0, decompress_status{0, 2}}; + }, + [] __device__(auto const& input_size) { return input_size > 64 * 1024; }); + + return thrust::reduce(rmm::exec_policy(stream), + input_sizes.begin(), + input_sizes.end(), + 0ul, + thrust::maximum()); +} + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index 1cc65d41a51..faeaf6f701d 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -50,8 +50,25 @@ batched_args create_batched_nvcomp_args(device_span c /** * @brief Convert nvcomp statuses into cuIO compression statuses. */ -void convert_status(std::optional> nvcomp_stats, +void convert_status(device_span nvcomp_stats, device_span actual_uncompressed_sizes, device_span cudf_stats, rmm::cuda_stream_view stream); + +/** + * @brief Fill the status array based on the uncompressed sizes. + */ +void convert_status(device_span actual_uncompressed_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream); + +/** + * @brief Mark unsupported input chunks for skipping. + * + * Returns the size of the largest remaining input chunk. 
+ */ +size_t filter_inputs(device_span input_sizes, + device_span statuses, + rmm::cuda_stream_view stream); + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 40a85a3ac37..61dc085fe63 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -61,14 +61,12 @@ size_t batched_compress_get_max_output_chunk_size(compression_type compression, * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers * @param[out] statuses List of output status structures - * @param[in] max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch * @param[in] stream CUDA stream to use */ void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, device_span statuses, - uint32_t max_uncomp_chunk_size, rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index e2a7c8c26a6..96a37183874 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1331,7 +1331,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, try { if (detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stat, comp_blk_size, stream); + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stat, stream); } else { gpu_snap(comp_in, comp_out, comp_stat, stream); } @@ -1346,10 +1346,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, } } else if (compression == ZLIB and detail::nvcomp_integration::is_all_enabled()) { nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, comp_blk_size, stream); + nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, stream); } else if (compression == ZSTD and detail::nvcomp_integration::is_all_enabled()) { - nvcomp::batched_compress( - nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stat, comp_blk_size, stream); + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stat, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 3dc3936ccb5..7081e4a5291 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1133,7 +1133,7 @@ void compress_check(device_span stats, rmm::cuda_stream CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), stats.begin(), stats.end(), - [] __device__(auto const& stat) { return stat.status == 0; }), + [] __device__(auto const& stat) { return stat.status != 1; }), "Error during decompression"); } @@ -1166,24 +1166,16 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, - comp_in, - comp_out, - comp_stats, - max_page_uncomp_data_size, - stream); + nvcomp::batched_compress( + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stats, stream); } else { gpu_snap(comp_in, comp_out, comp_stats, stream); } break; case parquet::Compression::ZSTD: if (nvcomp_integration::is_all_enabled()) { - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, - comp_in, - comp_out, - comp_stats, - max_page_uncomp_data_size, - stream); + nvcomp::batched_compress( + nvcomp::compression_type::ZSTD, comp_in, 
comp_out, comp_stats, stream); } break; case parquet::Compression::UNCOMPRESSED: break; From e3fe520b1ed8004e1bf43c14d134b809763da732 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Aug 2022 19:01:26 -0700 Subject: [PATCH 29/66] only skip compression of large chunks for ZSTD --- cpp/src/io/comp/nvcomp_adapter.cpp | 20 +++++++++++++++++++- cpp/src/io/comp/nvcomp_adapter.cu | 27 ++++++++++++++++----------- cpp/src/io/comp/nvcomp_adapter.cuh | 1 + 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index cb383d70766..0937ff6341c 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -344,6 +344,21 @@ inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept return !(iptr % alignment); } +std::optional max_allowed_compression_chunk_size(compression_type compression) +{ + switch (compression) { + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + return nvcompZstdMaxAllowedChunkSize; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + case compression_type::SNAPPY: return std::nullopt; + case compression_type::DEFLATE: return std::nullopt; + default: return std::nullopt; + } +} + void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -355,7 +370,10 @@ void batched_compress(compression_type compression, auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); - auto const max_uncomp_chunk_size = filter_inputs(nvcomp_args.input_data_sizes, statuses, stream); + auto const max_uncomp_chunk_size = filter_inputs(nvcomp_args.input_data_sizes, + statuses, + max_allowed_compression_chunk_size(compression), + stream); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 384f6b90b4c..506630b52b8 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -91,19 +91,24 @@ void convert_status(device_span actual_uncompressed_sizes, size_t filter_inputs(device_span input_sizes, device_span statuses, + std::optional max_valid_input_size, rmm::cuda_stream_view stream) { - auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), statuses.begin()); - thrust::transform_if( - rmm::exec_policy(stream), - statuses.begin(), - statuses.end(), - input_sizes.begin(), - status_size_it, - [] __device__(auto const& status) { - return thrust::pair{0, decompress_status{0, 2}}; - }, - [] __device__(auto const& input_size) { return input_size > 64 * 1024; }); + if (max_valid_input_size.has_value()) { + auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), statuses.begin()); + thrust::transform_if( + rmm::exec_policy(stream), + statuses.begin(), + statuses.end(), + input_sizes.begin(), + status_size_it, + [] __device__(auto const& status) { + return thrust::pair{0, decompress_status{0, 2}}; + }, + [max_size = max_valid_input_size.value()] __device__(size_t input_size) { + return input_size > max_size; + }); + } return thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index faeaf6f701d..7e847a5f5a8 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -69,6 +69,7 @@ void convert_status(device_span actual_uncompressed_sizes, */ size_t 
filter_inputs(device_span input_sizes, device_span statuses, + std::optional max_valid_input_size, rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp From 86812cb67b08070d53f512deba78888addee4f1e Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Aug 2022 19:10:05 -0700 Subject: [PATCH 30/66] page fragment scaling (tmp!) --- cpp/src/io/parquet/writer_impl.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7081e4a5291..365800db34b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1372,13 +1372,15 @@ void writer::impl::write(table_view const& table, std::vector co // iteratively reduce this value if the largest fragment exceeds the max page size limit (we // ideally want the page size to be below 1MB so as to have enough pages to get good // compression/decompression performance). - using cudf::io::parquet::gpu::max_page_fragment_size; + auto max_page_fragment_size = + (cudf::io::parquet::gpu::max_page_fragment_size * max_page_size_bytes) / + default_max_page_size_bytes; std::vector num_frag_in_part; std::transform(partitions.begin(), partitions.end(), std::back_inserter(num_frag_in_part), - [](auto const& part) { + [max_page_fragment_size](auto const& part) { return util::div_rounding_up_unsafe(part.num_rows, max_page_fragment_size); }); From 4fce5573db10d59bc7dd4121a72c6b9ee6a2fedb Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 26 Aug 2022 13:46:14 -0700 Subject: [PATCH 31/66] avoid calling get max comp size for oversized pages --- cpp/src/io/comp/nvcomp_adapter.cpp | 34 +++++++++++++++++------------- cpp/src/io/parquet/writer_impl.cu | 2 +- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 0937ff6341c..c5591a32715 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -240,10 +240,29 @@ auto batched_compress_temp_size(compression_type compression, return temp_size; } +constexpr std::optional max_allowed_compression_chunk_size(compression_type compression) +{ + switch (compression) { + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + return nvcompZstdMaxAllowedChunkSize; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + case compression_type::SNAPPY: return std::nullopt; + case compression_type::DEFLATE: return std::nullopt; + default: return std::nullopt; + } +} + // Dispatcher for nvcompBatchedCompressGetMaxOutputChunkSize size_t batched_compress_get_max_output_chunk_size(compression_type compression, uint32_t max_uncompressed_chunk_bytes) { + max_uncompressed_chunk_bytes = std::min( + max_allowed_compression_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); + size_t max_comp_chunk_size = 0; nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; switch (compression) { @@ -344,21 +363,6 @@ inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept return !(iptr % alignment); } -std::optional max_allowed_compression_chunk_size(compression_type compression) -{ - switch (compression) { - case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP - return nvcompZstdMaxAllowedChunkSize; -#else - CUDF_FAIL("Unsupported compression type"); -#endif - case compression_type::SNAPPY: return std::nullopt; - case compression_type::DEFLATE: return std::nullopt; - default: return std::nullopt; - } -} - void batched_compress(compression_type 
compression, device_span const> inputs, device_span const> outputs, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 365800db34b..5ee8a4f8b50 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1134,7 +1134,7 @@ void compress_check(device_span stats, rmm::cuda_stream stats.begin(), stats.end(), [] __device__(auto const& stat) { return stat.status != 1; }), - "Error during decompression"); + "Error during compression"); } void writer::impl::encode_pages(hostdevice_2dvector& chunks, From 3fd04a6eaf6baa759d93c825290b892c64e6465d Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 26 Aug 2022 13:53:32 -0700 Subject: [PATCH 32/66] remove constexpr --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index c5591a32715..dcc6ee70742 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -240,7 +240,7 @@ auto batched_compress_temp_size(compression_type compression, return temp_size; } -constexpr std::optional max_allowed_compression_chunk_size(compression_type compression) +std::optional max_allowed_compression_chunk_size(compression_type compression) { switch (compression) { case compression_type::ZSTD: From d977da2511aaff0df96d0a32853216c715d53c4f Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 26 Aug 2022 15:22:30 -0700 Subject: [PATCH 33/66] rename --- cpp/src/io/comp/nvcomp_adapter.cpp | 25 +++++++++++-------------- cpp/src/io/comp/nvcomp_adapter.hpp | 4 ++-- cpp/src/io/orc/writer_impl.cu | 9 ++++----- cpp/src/io/parquet/writer_impl.cu | 8 ++++---- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index dcc6ee70742..e02cea56ccd 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -240,7 +240,7 @@ auto batched_compress_temp_size(compression_type compression, return temp_size; } -std::optional max_allowed_compression_chunk_size(compression_type compression) +std::optional max_allowed_chunk_size(compression_type compression) { switch (compression) { case compression_type::ZSTD: @@ -255,25 +255,24 @@ std::optional max_allowed_compression_chunk_size(compression_type compre } } -// Dispatcher for nvcompBatchedCompressGetMaxOutputChunkSize -size_t batched_compress_get_max_output_chunk_size(compression_type compression, - uint32_t max_uncompressed_chunk_bytes) +size_t batched_compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncompressed_chunk_bytes) { - max_uncompressed_chunk_bytes = std::min( - max_allowed_compression_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), - max_uncompressed_chunk_bytes); + auto const capped_uncomp_bytes = + std::min(max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); size_t max_comp_chunk_size = 0; nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; switch (compression) { case compression_type::SNAPPY: status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); + capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); break; case compression_type::DEFLATE: #if NVCOMP_HAS_DEFLATE status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, 
&max_comp_chunk_size); + capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); break; #else CUDF_FAIL("Unsupported compression type"); @@ -281,7 +280,7 @@ size_t batched_compress_get_max_output_chunk_size(compression_type compression, case compression_type::ZSTD: #if NVCOMP_HAS_ZSTD_COMP status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); + capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); break; #else CUDF_FAIL("Unsupported compression type"); @@ -374,10 +373,8 @@ void batched_compress(compression_type compression, auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); - auto const max_uncomp_chunk_size = filter_inputs(nvcomp_args.input_data_sizes, - statuses, - max_allowed_compression_chunk_size(compression), - stream); + auto const max_uncomp_chunk_size = filter_inputs( + nvcomp_args.input_data_sizes, statuses, max_allowed_chunk_size(compression), stream); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 61dc085fe63..2eb1cd073c0 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -51,8 +51,8 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @param max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch */ -size_t batched_compress_get_max_output_chunk_size(compression_type compression, - uint32_t max_uncomp_chunk_size); +size_t batched_compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncomp_chunk_size); /** * @brief Device batch compression of given type. 
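For context, the renamed helpers are meant to be used together on the writer side: the caller queries the worst-case output chunk size first, sizes the output buffers with it, and then launches the batched compression. A minimal sketch follows (not part of the patch series; the include paths, the ZSTD choice, and the caller-prepared comp_in/comp_out/comp_stats spans are assumptions that mirror the writer changes just below):

// Illustrative sketch only -- not part of the patch series.
#include <io/comp/gpuinflate.hpp>      // decompress_status (renamed later in this series)
#include <io/comp/nvcomp_adapter.hpp>  // compression_type, batched_compress*

#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <cstdint>

namespace nvcomp = cudf::io::nvcomp;

void compress_batch(cudf::device_span<cudf::device_span<uint8_t const> const> comp_in,
                    cudf::device_span<cudf::device_span<uint8_t> const> comp_out,
                    cudf::device_span<cudf::io::decompress_status> comp_stats,
                    uint32_t compression_blocksize,
                    rmm::cuda_stream_view stream)
{
  // Worst-case compressed size of a single chunk; each comp_out buffer is allocated
  // to at least this size by the caller (allocation omitted here).
  [[maybe_unused]] auto const max_comp_chunk_size =
    nvcomp::batched_compress_max_output_chunk_size(nvcomp::compression_type::ZSTD,
                                                   compression_blocksize);

  // Chunks above nvcomp's allowed size are skipped inside the adapter and reported via comp_stats.
  nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stats, stream);
}
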
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 677e44d936d..ea5e00f15f2 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2016,13 +2016,12 @@ auto to_nvcomp_compression_type(CompressionKind compression_kind) CUDF_FAIL("Unsupported compression type"); } -size_t get_compress_max_output_chunk_size(CompressionKind compression_kind, - uint32_t compression_blocksize) +size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) { if (compression_kind == NONE) return 0; - return batched_compress_get_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); + return batched_compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), + compression_blocksize); } void writer::impl::persisted_statistics::persist(int num_table_rows, @@ -2143,7 +2142,7 @@ void writer::impl::write(table_view const& table) size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; auto const max_compressed_block_size = - get_compress_max_output_chunk_size(compression_kind_, compression_blocksize_); + max_compression_output_size(compression_kind_, compression_blocksize_); auto stream_output = [&]() { size_t max_stream_size = 0; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5ee8a4f8b50..7aa6f3e2edf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -917,12 +917,12 @@ auto to_nvcomp_compression_type(Compression codec) CUDF_FAIL("Unsupported compression type"); } -size_t get_compress_max_output_chunk_size(Compression codec, uint32_t compression_blocksize) +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) { if (codec == Compression::UNCOMPRESSED) return 0; - return batched_compress_get_max_output_chunk_size(to_nvcomp_compression_type(codec), - compression_blocksize); + return batched_compress_max_output_chunk_size(to_nvcomp_compression_type(codec), + compression_blocksize); } auto init_page_sizes(hostdevice_2dvector& chunks, @@ -979,7 +979,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, page_sizes.end(), comp_page_sizes.begin(), [compression_codec](auto page_size) { - return get_compress_max_output_chunk_size(compression_codec, page_size); + return max_compression_output_size(compression_codec, page_size); }); comp_page_sizes.host_to_device(stream); From 868d1659021fe070b491d8320a67bf4e7c39626d Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 29 Aug 2022 02:25:33 -0700 Subject: [PATCH 34/66] ORC alignment --- cpp/src/io/orc/writer_impl.cu | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ea5e00f15f2..82a7276b9ea 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -568,7 +568,7 @@ orc_streams writer::impl::create_streams(host_span columns, [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{kind, column.id(), size}); + streams.push_back(orc::Stream{kind, column.id(), (size == 0) ? 
0 : size + 3 * segmentation.num_rowgroups()}); types.push_back(type_kind); }; @@ -875,7 +875,8 @@ encoded_data encode_columns(orc_table_view const& orc_table, hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); auto const stream_offsets = streams.compute_offsets(orc_table.columns, segmentation.num_rowgroups()); - rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); + rmm::device_uvector encoded_data( + stream_offsets.data_size() + 3 * streams.size() * segmentation.num_rowgroups(), stream); auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); @@ -1021,10 +1022,30 @@ encoded_data encode_columns(orc_table_view const& orc_table, strm.lengths[strm_type] = 0; strm.data_ptrs[strm_type] = nullptr; } + if (long(strm.data_ptrs[strm_type]) % 4) { + strm.data_ptrs[strm_type] += (4 - long(strm.data_ptrs[strm_type]) % 4); + } } } } } + + /*for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto col_streams = chunk_streams[col_idx]; + for (auto const& stripe : segmentation.stripes) { + for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { + auto const rg_idx = *rg_idx_it; + auto& strm = col_streams[rg_idx]; + for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { + if (long(strm.data_ptrs[strm_type]) % 4) + std::cout << strm_type << ' ' << std::hex << long(strm.data_ptrs[strm_type]) + << std::endl; + if (strm.lengths[strm_type] % 4) + std::cout << strm_type << ' ' << std::hex << strm.lengths[strm_type] << std::endl; + } + } + } + }*/ chunk_streams.host_to_device(stream); if (orc_table.num_rows() > 0) { From 40307fd5c78ea0a62eadf2ba8292f606242750c4 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 11:58:27 -0700 Subject: [PATCH 35/66] initialize decompression_status arrays --- cpp/src/io/avro/reader_impl.cu | 4 ++++ cpp/src/io/comp/uncomp.cpp | 4 +++- cpp/src/io/orc/reader_impl.cu | 2 ++ cpp/src/io/orc/writer_impl.cu | 5 ++++- cpp/src/io/parquet/reader_impl.cu | 6 ++---- cpp/src/io/parquet/writer_impl.cu | 2 ++ 6 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index e5b73dc9360..0a2fb94cb41 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -165,6 +165,10 @@ rmm::device_buffer decompress_data(datasource& source, auto inflate_in = hostdevice_vector>(meta.block_list.size(), stream); auto inflate_out = hostdevice_vector>(meta.block_list.size(), stream); auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); + thrust::fill(rmm::exec_policy(stream), + inflate_stats.d_begin(), + inflate_stats.d_end(), + decompress_status{0, 1}); // Guess an initial maximum uncompressed block size. We estimate the compression factor is two // and round up to the next multiple of 4096 bytes. 
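Before the uncomp.cpp and ORC/Parquet hunks that follow, note the common pattern this patch applies at every call site: the status array is pre-filled on the device with a failed state before any (de)compression kernel runs, so entries that a kernel skips or never reaches are not misread as successes. A condensed sketch of that pattern, mirroring the ORC reader hunk below (illustrative; num_blocks and stream are placeholders for the values each call site already has):

// Illustrative sketch only -- not part of the patch series.
#include <io/comp/gpuinflate.hpp>  // decompress_status

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/fill.h>

#include <cstddef>

rmm::device_uvector<cudf::io::decompress_status> make_initialized_statuses(
  std::size_t num_blocks, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<cudf::io::decompress_status> stats(num_blocks, stream);
  // Pre-mark every block as not successfully decompressed; the kernels then
  // overwrite only the entries they actually process.
  thrust::fill(
    rmm::exec_policy(stream), stats.begin(), stats.end(), cudf::io::decompress_status{0, 1});
  return stats;
}
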
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 6f33c9f1de9..b251b092db1 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -520,7 +520,9 @@ size_t decompress_zstd(host_span src, hd_dsts[0] = d_dst; hd_dsts.host_to_device(stream); - auto hd_stats = hostdevice_vector(1, stream); + auto hd_stats = hostdevice_vector(1, stream); + hd_stats[0] = decompress_status{0, 1}; + hd_stats.host_to_device(stream); auto const max_uncomp_page_size = dst.size(); nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, hd_srcs, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 5df9b0dad7a..2daa76ff189 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -338,6 +338,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector inflate_stats(num_compressed_blocks, stream); + thrust::fill( + rmm::exec_policy(stream), inflate_stats.begin(), inflate_stats.end(), decompress_status{0, 1}); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d56234982b3..25416c974c7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -569,7 +569,8 @@ orc_streams writer::impl::create_streams(host_span columns, [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{kind, column.id(), (size == 0) ? 0 : size + 3 * segmentation.num_rowgroups()}); + streams.push_back(orc::Stream{ + kind, column.id(), (size == 0) ? 
0 : size + 3 * segmentation.num_rowgroups()}); types.push_back(type_kind); }; @@ -2201,6 +2202,8 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); hostdevice_vector comp_stats(num_compressed_blocks, stream); + thrust::fill( + rmm::exec_policy(stream), comp_stats.d_begin(), comp_stats.d_end(), decompress_status{0, 1}); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index d926bd10807..828995d6065 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1142,10 +1142,8 @@ rmm::device_buffer reader::impl::decompress_page_data( comp_out.reserve(num_comp_pages); rmm::device_uvector comp_stats(num_comp_pages, _stream); - thrust::fill(rmm::exec_policy(_stream), - comp_stats.begin(), - comp_stats.end(), - decompress_status{0, static_cast(-1000), 0}); + thrust::fill( + rmm::exec_policy(_stream), comp_stats.begin(), comp_stats.end(), decompress_status{0, 1}); size_t decomp_offset = 0; int32_t start_pos = 0; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7aa6f3e2edf..f16e4bb64ba 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1161,6 +1161,8 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); rmm::device_uvector comp_stats(max_comp_pages, stream); + thrust::fill( + rmm::exec_policy(stream), comp_stats.begin(), comp_stats.end(), decompress_status{0, 1}); gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { From 5bc095c90c6c9a52430dbdc0ee813ef7159f848a Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 13:39:34 -0700 Subject: [PATCH 36/66] comp status refactor --- cpp/src/io/avro/reader_impl.cu | 10 ++++------ cpp/src/io/comp/debrotli.cu | 7 ++++--- cpp/src/io/comp/gpuinflate.cu | 12 ++++++++---- cpp/src/io/comp/gpuinflate.hpp | 19 +++++++++++++------ cpp/src/io/comp/nvcomp_adapter.cpp | 4 ++-- cpp/src/io/comp/nvcomp_adapter.cu | 23 +++++++++++++++-------- cpp/src/io/comp/nvcomp_adapter.cuh | 6 +++--- cpp/src/io/comp/nvcomp_adapter.hpp | 4 ++-- cpp/src/io/comp/snap.cu | 9 +++++---- cpp/src/io/comp/uncomp.cpp | 6 +++--- cpp/src/io/comp/unsnap.cu | 7 ++++--- cpp/src/io/orc/orc_gpu.hpp | 14 +++++++------- cpp/src/io/orc/reader_impl.cu | 14 ++++++++------ cpp/src/io/orc/stripe_enc.cu | 27 +++++++++++---------------- cpp/src/io/orc/stripe_init.cu | 4 ---- cpp/src/io/orc/writer_impl.cu | 10 ++++++---- cpp/src/io/orc/writer_impl.hpp | 2 +- cpp/src/io/parquet/page_enc.cu | 12 ++++++------ cpp/src/io/parquet/parquet_gpu.hpp | 6 +++--- cpp/src/io/parquet/reader_impl.cu | 18 +++++++++++------- cpp/src/io/parquet/writer_impl.cu | 18 +++++------------- 21 files changed, 121 insertions(+), 111 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 0a2fb94cb41..7fcdf1bf29a 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -164,11 +164,11 @@ rmm::device_buffer decompress_data(datasource& source, if (meta.codec == "deflate") { auto inflate_in = hostdevice_vector>(meta.block_list.size(), stream); auto inflate_out = hostdevice_vector>(meta.block_list.size(), stream); - auto inflate_stats = 
hostdevice_vector(meta.block_list.size(), stream); + auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); thrust::fill(rmm::exec_policy(stream), inflate_stats.d_begin(), inflate_stats.d_end(), - decompress_status{0, 1}); + compression_result{0, compression_status::FAILURE}); // Guess an initial maximum uncompressed block size. We estimate the compression factor is two // and round up to the next multiple of 4096 bytes. @@ -194,8 +194,6 @@ rmm::device_buffer decompress_data(datasource& source, for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { inflate_out.host_to_device(stream); - CUDF_CUDA_TRY(cudaMemsetAsync( - inflate_stats.device_ptr(), 0, inflate_stats.memory_size(), stream.value())); gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream); inflate_stats.device_to_host(stream, true); @@ -208,9 +206,9 @@ rmm::device_buffer decompress_data(datasource& source, inflate_stats.begin(), std::back_inserter(actual_uncomp_sizes), [](auto const& inf_out, auto const& inf_stats) { - // If error status is 1 (buffer too small), the `bytes_written` field + // If error status is OUTPUT_OVERFLOW, the `bytes_written` field // actually contains the uncompressed data size - return inf_stats.status == 1 + return inf_stats.status == compression_status::OUTPUT_OVERFLOW ? std::max(inf_out.size(), inf_stats.bytes_written) : inf_out.size(); }); diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 07dc2cc9870..8ece1fad2e0 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1914,7 +1914,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction __global__ void __launch_bounds__(block_size, 2) gpu_debrotli_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, uint8_t* scratch, uint32_t scratch_size) { @@ -2017,7 +2017,8 @@ __global__ void __launch_bounds__(block_size, 2) // Output decompression status if (!t) { statuses[block_id].bytes_written = s->out - s->outbase; - statuses[block_id].status = s->error; + statuses[block_id].status = + (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; // Return ext heap used by last block (statistics) statuses[block_id].reserved = s->fb_size; } @@ -2079,7 +2080,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 16f4ea84f7f..16450ccb41a 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1027,7 +1027,7 @@ template __global__ void __launch_bounds__(block_size) inflate_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1134,8 +1134,12 @@ __global__ void __launch_bounds__(block_size) state->err = 1; } statuses[z].bytes_written = state->out - state->outbase; - statuses[z].status = state->err; - statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes + if (state->err == 1) { + statuses[z].status = compression_status::OUTPUT_OVERFLOW; + } else { + statuses[z].status = (state->err) ? 
compression_status::SUCCESS : compression_status::FAILURE; + } + statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1200,7 +1204,7 @@ __global__ void __launch_bounds__(1024) void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, gzip_header_included parse_hdr, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 3870b2ac3b3..97b14c091e4 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -25,12 +25,19 @@ namespace cudf { namespace io { +enum class compression_status : uint8_t { + SUCCESS, + FAILURE, + SKIPPED, + OUTPUT_OVERFLOW, +}; + /** * @brief Output parameters for the decompression interface */ -struct decompress_status { +struct compression_result { uint64_t bytes_written; - uint32_t status; + compression_status status; uint32_t reserved; }; @@ -50,7 +57,7 @@ enum class gzip_header_included { NO, YES }; */ void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, gzip_header_included parse_hdr, rmm::cuda_stream_view stream); @@ -78,7 +85,7 @@ void gpu_copy_uncompressed_blocks(device_span const> */ void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream); /** @@ -105,7 +112,7 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); */ void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream); @@ -123,7 +130,7 @@ void gpu_debrotli(device_span const> inputs, */ void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream); } // namespace io diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index e02cea56ccd..fc4909ec90c 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -160,7 +160,7 @@ size_t batched_decompress_temp_size(compression_type compression, void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) @@ -365,7 +365,7 @@ inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream) { static int batch_idx = 0; diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 506630b52b8..fb264bc4730 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -59,7 +59,7 @@ batched_args create_batched_nvcomp_args(device_span c void convert_status(device_span nvcomp_stats, device_span actual_uncompressed_sizes, - device_span cudf_stats, + device_span cudf_stats, rmm::cuda_stream_view stream) { thrust::transform_if( @@ -70,13 +70,18 @@ void convert_status(device_span nvcomp_stats, cudf_stats.begin(), cudf_stats.begin(), [] __device__(auto const& nvcomp_status, auto const& size) { - return decompress_status{size, nvcomp_status == nvcompStatus_t::nvcompSuccess ? 
0u : 1u}; + return compression_result{size, + nvcomp_status == nvcompStatus_t::nvcompSuccess + ? compression_status::SUCCESS + : compression_status::FAILURE}; }, - [] __device__(auto const& cudf_status) { return cudf_status.status != 2; }); + [] __device__(auto const& cudf_status) { + return cudf_status.status != compression_status::SKIPPED; + }); } void convert_status(device_span actual_uncompressed_sizes, - device_span cudf_stats, + device_span cudf_stats, rmm::cuda_stream_view stream) { thrust::transform_if( @@ -85,12 +90,14 @@ void convert_status(device_span actual_uncompressed_sizes, actual_uncompressed_sizes.end(), cudf_stats.begin(), cudf_stats.begin(), - [] __device__(auto const& size) { return decompress_status{size}; }, - [] __device__(auto const& cudf_status) { return cudf_status.status != 2; }); + [] __device__(auto const& size) { return compression_result{size}; }, + [] __device__(auto const& cudf_status) { + return cudf_status.status != compression_status::SKIPPED; + }); } size_t filter_inputs(device_span input_sizes, - device_span statuses, + device_span statuses, std::optional max_valid_input_size, rmm::cuda_stream_view stream) { @@ -103,7 +110,7 @@ size_t filter_inputs(device_span input_sizes, input_sizes.begin(), status_size_it, [] __device__(auto const& status) { - return thrust::pair{0, decompress_status{0, 2}}; + return thrust::pair{0, compression_result{0, compression_status::SKIPPED}}; }, [max_size = max_valid_input_size.value()] __device__(size_t input_size) { return input_size > max_size; diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index 7e847a5f5a8..9d4282216b1 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -52,14 +52,14 @@ batched_args create_batched_nvcomp_args(device_span c */ void convert_status(device_span nvcomp_stats, device_span actual_uncompressed_sizes, - device_span cudf_stats, + device_span cudf_stats, rmm::cuda_stream_view stream); /** * @brief Fill the status array based on the uncompressed sizes. */ void convert_status(device_span actual_uncompressed_sizes, - device_span cudf_stats, + device_span cudf_stats, rmm::cuda_stream_view stream); /** @@ -68,7 +68,7 @@ void convert_status(device_span actual_uncompressed_sizes, * Returns the size of the largest remaining input chunk. 
*/ size_t filter_inputs(device_span input_sizes, - device_span statuses, + device_span statuses, std::optional max_valid_input_size, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 2eb1cd073c0..9eb7f3e2bbf 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -40,7 +40,7 @@ enum class compression_type { SNAPPY, ZSTD, DEFLATE }; void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); @@ -66,7 +66,7 @@ size_t batched_compress_max_output_chunk_size(compression_type compression, void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 820a7f937d7..3696f8e53e4 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -260,7 +260,7 @@ static __device__ uint32_t Match60(const uint8_t* src1, __global__ void __launch_bounds__(128) snap_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses) + device_span statuses) { __shared__ __align__(16) snap_state_s state_g; @@ -338,14 +338,15 @@ __global__ void __launch_bounds__(128) __syncthreads(); if (!t) { statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; - statuses[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; - statuses[blockIdx.x].reserved = 0; + statuses[blockIdx.x].status = + (s->dst > s->end) ? compression_status::FAILURE : compression_status::SUCCESS; + statuses[blockIdx.x].reserved = 0; } } void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index b251b092db1..8e58f86317c 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -520,8 +520,8 @@ size_t decompress_zstd(host_span src, hd_dsts[0] = d_dst; hd_dsts.host_to_device(stream); - auto hd_stats = hostdevice_vector(1, stream); - hd_stats[0] = decompress_status{0, 1}; + auto hd_stats = hostdevice_vector(1, stream); + hd_stats[0] = compression_result{0, compression_status::FAILURE}; hd_stats.host_to_device(stream); auto const max_uncomp_page_size = dst.size(); nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, @@ -533,7 +533,7 @@ size_t decompress_zstd(host_span src, stream); hd_stats.device_to_host(stream, true); - CUDF_EXPECTS(hd_stats[0].status == 0, "ZSTD decompression failed"); + CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` CUDF_CUDA_TRY(cudaMemcpyAsync( diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 98011a57ea8..8a3fe717d9e 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -627,7 +627,7 @@ template __global__ void __launch_bounds__(block_size) unsnap_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses) + device_span statuses) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -699,7 +699,8 @@ __global__ void __launch_bounds__(block_size) } if 
(!t) { statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - statuses[strm_id].status = s->error; + statuses[strm_id].status = + (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; if (log_cyclecount) { statuses[strm_id].reserved = clock() - s->tstart; } else { @@ -710,7 +711,7 @@ __global__ void __launch_bounds__(block_size) void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 7f970afda1e..b3c45257153 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -56,12 +56,12 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - device_span* dec_in_ctl; // [in] input buffer to decompress - device_span* dec_out_ctl; // [in] output buffer to decompress into - device_span decstatus; // [in] results of decompression - device_span* copy_in_ctl; // [out] input buffer to copy - device_span* copy_out_ctl; // [out] output buffer to copy to + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span decstatus; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of @@ -360,7 +360,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_stat, + device_span comp_stat, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 2daa76ff189..80f0958202a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -262,18 +262,18 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span stats, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < stats.size()) { - if (stats[tid].status != 0) { + if (stats[tid].status != compression_status::SUCCESS) { *any_block_failure = true; // Doesn't need to be atomic } } } -void decompress_check(device_span stats, +void decompress_check(device_span stats, bool* any_block_failure, rmm::cuda_stream_view stream) { @@ -337,9 +337,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_stats(num_compressed_blocks, stream); - thrust::fill( - rmm::exec_policy(stream), inflate_stats.begin(), inflate_stats.end(), decompress_status{0, 1}); + rmm::device_uvector inflate_stats(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + inflate_stats.begin(), + 
inflate_stats.end(), + compression_result{0, compression_status::FAILURE}); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 96a37183874..cfe4d83ff20 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1154,7 +1154,7 @@ __global__ void __launch_bounds__(256) device_2dspan streams, // const? device_span> inputs, device_span> outputs, - device_span statuses, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1181,7 +1181,7 @@ __global__ void __launch_bounds__(256) inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; auto const dst_offset = b * compressed_block_size(max_comp_blk_size) + padded_block_header_size; outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; - statuses[ss.first_block + b] = {blk_size, 1, 0}; + statuses[ss.first_block + b] = {0, compression_status::FAILURE}; } } @@ -1203,7 +1203,7 @@ __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1229,11 +1229,11 @@ __global__ void __launch_bounds__(1024) if (t == 0) { auto const src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - auto dst_len = (statuses[ss.first_block + b].status == 0) + auto dst_len = (statuses[ss.first_block + b].status == compression_status::SUCCESS) ? statuses[ss.first_block + b].bytes_written : src_len; uint32_t blk_size24{}; - if (statuses[ss.first_block + b].status == 0) { + if (statuses[ss.first_block + b].status == compression_status::SUCCESS) { // Copy from uncompressed source src = inputs[ss.first_block + b].data(); statuses[ss.first_block + b].bytes_written = src_len; @@ -1310,7 +1310,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_stat, + device_span comp_stat, rmm::cuda_stream_view stream) { rmm::device_uvector> comp_in(num_compressed_blocks, stream); @@ -1337,10 +1337,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, } } catch (...) 
{ // There was an error in compressing so set an error status for each block - thrust::for_each(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(decompress_status & stat) { stat.status = 1; }); + thrust::for_each( + rmm::exec_policy(stream), + comp_stat.begin(), + comp_stat.end(), + [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } @@ -1353,12 +1354,6 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CUDF_FAIL("Unsupported compression type"); } - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(auto const& stat) { return stat.status == 0; }), - "Error during compression"); - dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index edae60bfa6d..0166644c386 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -178,10 +178,6 @@ __global__ void __launch_bounds__(128, 8) uncompressed_size_actual = block_len; } else { if (num_compressed_blocks > max_compressed_blocks) { break; } - if (shuffle((lane_id == 0) ? dec_status[num_compressed_blocks].status : 0) != 0) { - // Decompression failed, not much point in doing anything else - break; - } uint32_t const dst_size = dec_out[num_compressed_blocks].size(); uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 25416c974c7..4486074bb4c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1364,7 +1364,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -2201,9 +2201,11 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_stats(num_compressed_blocks, stream); - thrust::fill( - rmm::exec_policy(stream), comp_stats.d_begin(), comp_stats.d_end(), decompress_status{0, 1}); + hostdevice_vector comp_stats(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + comp_stats.d_begin(), + comp_stats.d_end(), + compression_result{0, compression_status::FAILURE}); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index ed360a77632..dc8aad33af0 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -390,7 +390,7 @@ class writer::impl { file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index a96fa1a45d2..6c003c7d678 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -870,7 +870,7 @@ __global__ void 
__launch_bounds__(128, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats) + device_span comp_stats) { __shared__ __align__(8) page_enc_state_s state_g; using block_scan = cub::BlockScan; @@ -1226,7 +1226,7 @@ __global__ void __launch_bounds__(128, 8) } pages[blockIdx.x] = s->page; if (not comp_stats.empty()) { - comp_stats[blockIdx.x] = {0, ~0u}; + comp_stats[blockIdx.x] = {0, compression_status::FAILURE}; pages[blockIdx.x].comp_stat = &comp_stats[blockIdx.x]; } } @@ -1263,7 +1263,7 @@ __global__ void __launch_bounds__(128) gpuDecideCompression(device_spanbytes_written; - if (comp_status->status != 0) { atomicAdd(&error_count, 1); } + if (comp_status->status != compression_status::SUCCESS) { atomicAdd(&error_count, 1); } } } uncompressed_data_size = warp_reduce(temp_storage[0]).Sum(uncompressed_data_size); @@ -1680,7 +1680,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, // blockDim(128, 1, 1) __global__ void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_stat, device_span page_stats, const statistics_chunk* chunk_stats) { @@ -2065,7 +2065,7 @@ void InitEncoderPages(device_2dspan chunks, void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_stats, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); @@ -2080,7 +2080,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view } void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6d4e116c711..14e3584900a 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -367,7 +367,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - decompress_status* comp_stat; //!< Ptr to compression status + compression_result* comp_stat; //!< Ptr to compression status }; /** @@ -543,7 +543,7 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_stats, rmm::cuda_stream_view stream); /** @@ -564,7 +564,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 828995d6065..d475c3bc750 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -247,13 +247,15 @@ std::tuple conversion_info(type_id column_type_id, return std::make_tuple(type_width, clock_rate, converted_type); } -inline void decompress_check(device_span stats, +inline void decompress_check(device_span stats, rmm::cuda_stream_view stream) { CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), stats.begin(), stats.end(), - [] __device__(auto const& stat) { return stat.status == 0; }), + [] __device__(auto const& stat) { + return stat.status == compression_status::SUCCESS; + }), "Error during decompression"); } } // namespace @@ -1141,9 +1143,11 @@ rmm::device_buffer reader::impl::decompress_page_data( std::vector> comp_out; comp_out.reserve(num_comp_pages); - rmm::device_uvector comp_stats(num_comp_pages, _stream); - thrust::fill( - rmm::exec_policy(_stream), comp_stats.begin(), comp_stats.end(), decompress_status{0, 1}); + rmm::device_uvector comp_stats(num_comp_pages, _stream); + thrust::fill(rmm::exec_policy(_stream), + comp_stats.begin(), + comp_stats.end(), + compression_result{0, compression_status::FAILURE}); size_t decomp_offset = 0; int32_t start_pos = 0; @@ -1167,8 +1171,8 @@ rmm::device_buffer reader::impl::decompress_page_data( host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_stats_view(comp_stats.data() + start_pos, - codec.num_pages); + device_span d_comp_stats_view(comp_stats.data() + start_pos, + codec.num_pages); switch (codec.compression_type) { case parquet::GZIP: diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f16e4bb64ba..f87717956e4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1128,15 +1128,6 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void compress_check(device_span stats, rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - stats.begin(), - stats.end(), - [] __device__(auto const& stat) { return stat.status != 1; }), - "Error during compression"); -} - void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, size_t max_page_uncomp_data_size, @@ -1160,9 +1151,11 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); - rmm::device_uvector comp_stats(max_comp_pages, stream); - thrust::fill( - rmm::exec_policy(stream), comp_stats.begin(), comp_stats.end(), 
decompress_status{0, 1}); + rmm::device_uvector comp_stats(max_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_stats.begin(), + comp_stats.end(), + compression_result{0, compression_status::FAILURE}); gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { @@ -1183,7 +1176,6 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks case parquet::Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } - compress_check(comp_stats, stream); // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level From ca365bf33e03c579df44d82fb2426d79c659d2a2 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 13:45:35 -0700 Subject: [PATCH 37/66] docs --- cpp/src/io/comp/gpuinflate.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 97b14c091e4..d0e4e27e678 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -25,15 +25,18 @@ namespace cudf { namespace io { +/** + * @brief Status of a compression/decompression operation. + */ enum class compression_status : uint8_t { - SUCCESS, - FAILURE, - SKIPPED, - OUTPUT_OVERFLOW, + SUCCESS, ///< Successful, output is valid + FAILURE, ///< Failed, output is invalid (e.g. input is unsupported in some way) + SKIPPED, ///< Operation skipped (if conversion, uncompressed data can be used) + OUTPUT_OVERFLOW, ///< Output buffer is too small; operation can succeed with larger output }; /** - * @brief Output parameters for the decompression interface + * @brief Descriptor of compression/decompression result. */ struct compression_result { uint64_t bytes_written; From 1293396efbbaea94ba0a72e4bea6d65133857d96 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 13:57:53 -0700 Subject: [PATCH 38/66] remove comp debug dump --- cpp/src/io/comp/nvcomp_adapter.cpp | 48 +----------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index fc4909ec90c..44762e9e456 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -368,7 +368,6 @@ void batched_compress(compression_type compression, device_span statuses, rmm::cuda_stream_view stream) { - static int batch_idx = 0; auto const num_chunks = inputs.size(); auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); @@ -378,7 +377,7 @@ void batched_compress(compression_type compression, auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); - CUDF_EXPECTS(is_aligned(scratch.data(), 8), "misaligned scratch"); + CUDF_EXPECTS(is_aligned(scratch.data(), 8), "misaligned scratch buffer"); rmm::device_uvector actual_compressed_data_sizes(num_chunks, stream); @@ -393,51 +392,6 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); - if (detail::getenv_or("DUMP_NVCOMP_INPUT", 0)) { - stream.synchronize(); - std::vector> h_inputs(num_chunks); - cudaMemcpy(h_inputs.data(), - inputs.data(), - sizeof(device_span) * num_chunks, - cudaMemcpyDeviceToHost); - int idx = 0; - for (auto& input : h_inputs) { - std::vector h_input(input.size()); - cudaMemcpy( - h_input.data(), input.data(), sizeof(uint8_t) * input.size(), cudaMemcpyDeviceToHost); - std::ofstream myFile("comp_in_" + 
std::to_string(batch_idx) + "_" + std::to_string(idx++), - std::ios::out | std::ios::binary); - myFile.write(reinterpret_cast(h_input.data()), h_input.size()); - } - } - - if (detail::getenv_or("DUMP_NVCOMP_OUTPUT", 0)) { - stream.synchronize(); - std::vector> h_outputs(num_chunks); - cudaMemcpy(h_outputs.data(), - outputs.data(), - sizeof(device_span) * num_chunks, - cudaMemcpyDeviceToHost); - - std::vector actual_sizes(num_chunks); - cudaMemcpy(actual_sizes.data(), - actual_compressed_data_sizes.data(), - sizeof(size_t) * num_chunks, - cudaMemcpyDeviceToHost); - - int idx = 0; - for (auto i = 0u; i < num_chunks; ++i) { - std::vector h_output(actual_sizes[i]); - cudaMemcpy(h_output.data(), - h_outputs[i].data(), - sizeof(uint8_t) * actual_sizes[i], - cudaMemcpyDeviceToHost); - std::ofstream myFile("comp_out_" + std::to_string(batch_idx) + "_" + std::to_string(idx++), - std::ios::out | std::ios::binary); - myFile.write(reinterpret_cast(h_output.data()), actual_sizes[i]); - } - } - ++batch_idx; convert_status(actual_compressed_data_sizes, statuses, stream); } From b10ad1dfbd6a450dddbe6f9e3bc792e2538bf814 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 16:36:09 -0700 Subject: [PATCH 39/66] remove encoded_data padding --- cpp/src/io/orc/writer_impl.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4486074bb4c..d464dab98f7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -877,8 +877,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); auto const stream_offsets = streams.compute_offsets(orc_table.columns, segmentation.num_rowgroups()); - rmm::device_uvector encoded_data( - stream_offsets.data_size() + 3 * streams.size() * segmentation.num_rowgroups(), stream); + rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); From 1016b2c4306be19f8fb0f759f9ca9edb849dcba0 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 16:43:29 -0700 Subject: [PATCH 40/66] more renaming --- cpp/src/io/comp/nvcomp_adapter.cpp | 6 +++--- cpp/src/io/comp/nvcomp_adapter.cu | 28 ++++++++++++++-------------- cpp/src/io/comp/nvcomp_adapter.cuh | 26 +++++++++++++------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 44762e9e456..70ab33dc9df 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -201,7 +201,7 @@ void batched_decompress(compression_type compression, stream.value()); CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); - convert_status(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream); + update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream); } // Dispatcher for nvcompBatchedCompressGetTempSize @@ -372,7 +372,7 @@ void batched_compress(compression_type compression, auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); - auto const max_uncomp_chunk_size = filter_inputs( + auto const max_uncomp_chunk_size = skip_unsupported_inputs( nvcomp_args.input_data_sizes, statuses, max_allowed_chunk_size(compression), stream); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); @@ 
-392,7 +392,7 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); - convert_status(actual_compressed_data_sizes, statuses, stream); + update_compression_results(actual_compressed_data_sizes, statuses, stream); } } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index fb264bc4730..2e59d7b1acf 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -57,16 +57,16 @@ batched_args create_batched_nvcomp_args(device_span c std::move(output_data_sizes)}; } -void convert_status(device_span nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span cudf_stats, - rmm::cuda_stream_view stream) +void update_compression_results(device_span nvcomp_stats, + device_span actual_output_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream) { thrust::transform_if( rmm::exec_policy(stream), nvcomp_stats.begin(), nvcomp_stats.end(), - actual_uncompressed_sizes.begin(), + actual_output_sizes.begin(), cudf_stats.begin(), cudf_stats.begin(), [] __device__(auto const& nvcomp_status, auto const& size) { @@ -80,14 +80,14 @@ void convert_status(device_span nvcomp_stats, }); } -void convert_status(device_span actual_uncompressed_sizes, - device_span cudf_stats, - rmm::cuda_stream_view stream) +void update_compression_results(device_span actual_output_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream) { thrust::transform_if( rmm::exec_policy(stream), - actual_uncompressed_sizes.begin(), - actual_uncompressed_sizes.end(), + actual_output_sizes.begin(), + actual_output_sizes.end(), cudf_stats.begin(), cudf_stats.begin(), [] __device__(auto const& size) { return compression_result{size}; }, @@ -96,10 +96,10 @@ void convert_status(device_span actual_uncompressed_sizes, }); } -size_t filter_inputs(device_span input_sizes, - device_span statuses, - std::optional max_valid_input_size, - rmm::cuda_stream_view stream) +size_t skip_unsupported_inputs(device_span input_sizes, + device_span statuses, + std::optional max_valid_input_size, + rmm::cuda_stream_view stream) { if (max_valid_input_size.has_value()) { auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), statuses.begin()); diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index 9d4282216b1..4938a30b8b3 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -48,28 +48,28 @@ batched_args create_batched_nvcomp_args(device_span c rmm::cuda_stream_view stream); /** - * @brief Convert nvcomp statuses into cuIO compression statuses. + * @brief Convert nvcomp statuses and output sizes into cuIO compression results. */ -void convert_status(device_span nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span cudf_stats, - rmm::cuda_stream_view stream); +void update_compression_results(device_span nvcomp_stats, + device_span actual_output_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream); /** - * @brief Fill the status array based on the uncompressed sizes. + * @brief Fill the result array based on the actual output sizes. */ -void convert_status(device_span actual_uncompressed_sizes, - device_span cudf_stats, - rmm::cuda_stream_view stream); +void update_compression_results(device_span actual_output_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream); /** * @brief Mark unsupported input chunks for skipping. * * Returns the size of the largest remaining input chunk. 
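Note, for readers skimming the renames in this patch: skip_unsupported_inputs marks any chunk whose uncompressed size exceeds the codec's limit as SKIPPED, so the writer can fall back to storing that block uncompressed, and it returns the largest chunk size still eligible for compression (later used to size the scratch buffer); update_compression_results then folds the per-chunk output sizes, and nvCOMP statuses where available, back into the compression_result array. A simplified host-side analogue with stand-in types and sketch-only function names — the real implementation operates on device spans via thrust, so this is illustrative only:

    #include <algorithm>
    #include <cstddef>
    #include <optional>
    #include <vector>

    enum class status { SUCCESS, FAILURE, SKIPPED };
    struct result { std::size_t bytes_written{0}; status st{status::FAILURE}; };

    // Mark chunks above the codec limit as SKIPPED; return the largest remaining size.
    std::size_t skip_unsupported(std::vector<std::size_t>& sizes,
                                 std::vector<result>& results,
                                 std::optional<std::size_t> max_valid_size)
    {
      std::size_t largest_remaining = 0;
      for (std::size_t i = 0; i < sizes.size(); ++i) {
        if (max_valid_size and sizes[i] > *max_valid_size) {
          results[i] = {0, status::SKIPPED};  // caller keeps this block uncompressed
        } else {
          largest_remaining = std::max(largest_remaining, sizes[i]);
        }
      }
      return largest_remaining;
    }

    // Fold actual output sizes back into the results, leaving skipped chunks untouched.
    void update_results(std::vector<std::size_t> const& output_sizes, std::vector<result>& results)
    {
      for (std::size_t i = 0; i < results.size(); ++i) {
        if (results[i].st != status::SKIPPED) { results[i] = {output_sizes[i], status::SUCCESS}; }
      }
    }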
*/ -size_t filter_inputs(device_span input_sizes, - device_span statuses, - std::optional max_valid_input_size, - rmm::cuda_stream_view stream); +size_t skip_unsupported_inputs(device_span input_sizes, + device_span statuses, + std::optional max_valid_input_size, + rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp From 6d21074b497955b2655a84bc9b8330ed38a419e1 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 16:43:42 -0700 Subject: [PATCH 41/66] comment --- cpp/src/io/parquet/page_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 6c003c7d678..5b84b62643d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1222,7 +1222,7 @@ __global__ void __launch_bounds__(128, 8) s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { comp_in[blockIdx.x] = {base, actual_data_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // unused + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // size is unused } pages[blockIdx.x] = s->page; if (not comp_stats.empty()) { From caf11028b2a0e4af3d063acac7c96349faae5a7b Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 17:25:52 -0700 Subject: [PATCH 42/66] undo cmake workaround --- cpp/cmake/thirdparty/get_nvcomp.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index f04720c4835..41bbf44abc8 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -14,13 +14,18 @@ # This function finds nvcomp and sets any additional necessary environment variables. 
function(find_and_configure_nvcomp) - # WORKAROUND; DO NOT MERGE - rapids_find_package( - nvcomp REQUIRED + + include(${rapids-cmake-dir}/cpm/nvcomp.cmake) + rapids_cpm_nvcomp( BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports + USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} ) + # Per-thread default stream + if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) + endif() endfunction() find_and_configure_nvcomp() From 4e3eb902eb7e035b819be4bdae3bdff7043ec8c8 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 30 Aug 2022 18:45:51 -0700 Subject: [PATCH 43/66] update test --- cpp/tests/io/comp/decomp_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 134f262cb13..c51a7854e25 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -58,7 +58,7 @@ struct DecompressTest : public cudf::test::BaseFixture { inf_out[0] = dst; inf_out.host_to_device(stream); - hostdevice_vector inf_stat(1, stream); + hostdevice_vector inf_stat(1, stream); inf_stat[0] = {}; inf_stat.host_to_device(stream); @@ -66,7 +66,7 @@ struct DecompressTest : public cudf::test::BaseFixture { cudaMemcpyAsync( decompressed->data(), dst.data(), dst.size(), cudaMemcpyDeviceToHost, stream.value()); inf_stat.device_to_host(stream, true); - ASSERT_EQ(inf_stat[0].status, 0); + ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } }; @@ -76,7 +76,7 @@ struct DecompressTest : public cudf::test::BaseFixture { struct GzipDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { cudf::io::gpuinflate(d_inf_in, d_inf_out, @@ -92,7 +92,7 @@ struct GzipDecompressTest : public DecompressTest { struct SnappyDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::default_stream_value); } @@ -104,7 +104,7 @@ struct SnappyDecompressTest : public DecompressTest { struct BrotliDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1), cudf::default_stream_value}; From 812e5b695a8076be68b917b3bd31d0abb609dee7 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 00:14:54 -0700 Subject: [PATCH 44/66] status return fix --- cpp/src/io/comp/gpuinflate.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 16450ccb41a..0e815f02bc5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1137,7 +1137,8 @@ __global__ void __launch_bounds__(block_size) if (state->err == 1) { statuses[z].status = compression_status::OUTPUT_OVERFLOW; } else { - statuses[z].status = (state->err) ? compression_status::SUCCESS : compression_status::FAILURE; + statuses[z].status = + (state->err == 0) ? 
compression_status::SUCCESS : compression_status::FAILURE; } statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } From 13b211aae01641178457b2e4bcb3007d749acebf Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 14:52:30 -0700 Subject: [PATCH 45/66] print skipped count [TEMP] --- cpp/src/io/comp/nvcomp_adapter.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 2e59d7b1acf..a2ccce00894 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -117,6 +118,12 @@ size_t skip_unsupported_inputs(device_span input_sizes, }); } + auto const skipped_num = thrust::count_if( + rmm::exec_policy(stream), statuses.begin(), statuses.end(), [] __device__(auto const& x) { + return x.status == compression_status::SKIPPED; + }); + std::cout << "Skipped " << skipped_num << " out of " << statuses.size() << std::endl; + return thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), input_sizes.end(), From c8b8a33019892ddf20bd420e5e75705810d6c37a Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 17:46:45 -0700 Subject: [PATCH 46/66] alignment --- cpp/src/io/comp/nvcomp_adapter.cpp | 20 ++++++++ cpp/src/io/comp/nvcomp_adapter.hpp | 22 ++++++++- cpp/src/io/orc/orc_common.hpp | 7 --- cpp/src/io/orc/orc_gpu.hpp | 2 + cpp/src/io/orc/stripe_enc.cu | 31 +++++++++---- cpp/src/io/orc/writer_impl.cu | 73 +++++++++++++++++------------- cpp/src/io/parquet/page_enc.cu | 19 ++++---- cpp/src/io/parquet/parquet_gpu.hpp | 2 + cpp/src/io/parquet/writer_impl.cu | 18 +++++++- 9 files changed, 133 insertions(+), 61 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 70ab33dc9df..4c6256b427f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -293,6 +293,16 @@ size_t batched_compress_max_output_chunk_size(compression_type compression, return max_comp_chunk_size; } +size_t compress_input_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 8; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 2; + default: CUDF_FAIL("Unsupported compression type"); + } +} + // Dispatcher for nvcompBatchedCompressAsync static void batched_compress_async(compression_type compression, const void* const* device_uncompressed_ptrs, @@ -395,4 +405,14 @@ void batched_compress(compression_type compression, update_compression_results(actual_compressed_data_sizes, statuses, stream); } +bool is_compression_enabled(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return detail::nvcomp_integration::is_all_enabled(); + case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); + case compression_type::ZSTD: return detail::nvcomp_integration::is_all_enabled(); + default: return false; + } +} + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 9eb7f3e2bbf..30c8a2d4382 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -26,6 +26,16 @@ namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; +/** + * @brief Whether the given compression type is enabled through nvCOMP. + * + * Result depends on nvCOMP version and environment variables. 
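Note: the alignment helpers added in this patch report requirements in bits; call sites such as block_alignment in the ORC writer and page_alignment in the Parquet writer (later in this patch) convert that to a byte alignment with 1u << bits before padding sizes and pointers. A tiny self-contained illustration of the convention — the bit count below is an arbitrary example, the real per-codec values come from the adapter:

    #include <cstdint>
    #include <iostream>

    // Round v up to a multiple of align (align must be a power of two in this sketch).
    constexpr std::uint64_t round_up(std::uint64_t v, std::uint64_t align)
    {
      return (v + align - 1) & ~(align - 1);
    }

    int main()
    {
      std::uint32_t const alignment_bits = 3;                     // example value only
      std::uint32_t const block_align    = 1u << alignment_bits;  // -> 8-byte alignment
      std::cout << round_up(100, block_align) << '\n';            // prints 104
    }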
+ * + * @param compression Compression type + * @returns true if nvCOMP use is enabled; false otherwise + */ +[[nodiscard]] bool is_compression_enabled(compression_type compression); + /** * @brief Device batch decompression of given type. * @@ -51,8 +61,16 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @param max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch */ -size_t batched_compress_max_output_chunk_size(compression_type compression, - uint32_t max_uncomp_chunk_size); +[[nodiscard]] size_t batched_compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncomp_chunk_size); + +/** + * @brief Gets input alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment, in bits + */ +[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression); /** * @brief Device batch compression of given type. diff --git a/cpp/src/io/orc/orc_common.hpp b/cpp/src/io/orc/orc_common.hpp index 29a4ad6ed78..c2898b362a6 100644 --- a/cpp/src/io/orc/orc_common.hpp +++ b/cpp/src/io/orc/orc_common.hpp @@ -24,13 +24,6 @@ namespace orc { static constexpr uint32_t block_header_size = 3; -constexpr uint32_t compressed_block_size(uint32_t compressed_data_size) -{ - return ((compressed_data_size + block_header_size + 0xFF) & ~0xFF); -} - -static constexpr uint32_t padded_block_header_size = compressed_block_size(0); - enum CompressionKind : uint8_t { NONE = 0, ZLIB = 1, diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index b3c45257153..436f680812a 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -348,6 +348,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression + * @param[in] block_align Required alignment for uncompressed blocks * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[out] comp_stat Per-block compression status @@ -358,6 +359,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, + uint32_t block_align, device_2dspan strm_desc, device_2dspan enc_streams, device_span comp_stat, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index cfe4d83ff20..927e0a658a5 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -17,14 +17,16 @@ #include "orc_common.hpp" #include "orc_gpu.hpp" -#include -#include -#include #include #include #include #include +#include +#include +#include +#include + #include #include #include @@ -1147,6 +1149,7 @@ __global__ void __launch_bounds__(1024) * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression + * @param[in] block_align Required alignment for uncompressed blocks */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) @@ -1157,11 +1160,15 @@ __global__ void __launch_bounds__(256) device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, - uint32_t max_comp_blk_size) + uint32_t max_comp_blk_size, + uint32_t block_align) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; + auto const 
padded_block_header_size = util::round_up_unsafe(block_header_size, block_align); + auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, block_align); + auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; uint32_t t = threadIdx.x; @@ -1178,8 +1185,8 @@ __global__ void __launch_bounds__(256) num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * compressed_block_size(max_comp_blk_size) + padded_block_header_size; + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + auto const dst_offset = b * padded_comp_block_size + padded_block_header_size; outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; statuses[ss.first_block + b] = {0, compression_status::FAILURE}; } @@ -1308,6 +1315,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, + uint32_t block_align, device_2dspan strm_desc, device_2dspan enc_streams, device_span comp_stat, @@ -1325,11 +1333,12 @@ void CompressOrcDataStreams(uint8_t* compressed_data, comp_stat, compressed_data, comp_blk_size, - max_comp_blk_size); + max_comp_blk_size, + block_align); if (compression == SNAPPY) { try { - if (detail::nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stat, stream); } else { @@ -1345,10 +1354,12 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } - } else if (compression == ZLIB and detail::nvcomp_integration::is_all_enabled()) { + } else if (compression == ZLIB and + nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { nvcomp::batched_compress( nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, stream); - } else if (compression == ZSTD and detail::nvcomp_integration::is_all_enabled()) { + } else if (compression == ZSTD and + nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stat, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d464dab98f7..a6f9a60823a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -522,6 +522,24 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) } } +auto to_nvcomp_compression_type(CompressionKind compression_kind) +{ + if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; + if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; + if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; + CUDF_FAIL("Unsupported compression type"); +} + +auto block_alignment(CompressionKind compression_kind) +{ + if (compression_kind == NONE or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(compression_kind)); +} + orc_streams writer::impl::create_streams(host_span 
columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes) @@ -567,10 +585,13 @@ orc_streams writer::impl::create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - const auto base = column.index() * gpu::CI_NUM_STREAMS; - ids[base + index_type] = streams.size(); + auto const max_alignment_padding = block_alignment(compression_kind_) - 1; + const auto base = column.index() * gpu::CI_NUM_STREAMS; + ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ - kind, column.id(), (size == 0) ? 0 : size + 3 * segmentation.num_rowgroups()}); + kind, + column.id(), + (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); types.push_back(type_kind); }; @@ -871,6 +892,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, encoder_decimal_info&& dec_chunk_sizes, file_segmentation const& segmentation, orc_streams const& streams, + uint32_t block_align, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); @@ -1023,30 +1045,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, strm.lengths[strm_type] = 0; strm.data_ptrs[strm_type] = nullptr; } - if (long(strm.data_ptrs[strm_type]) % 4) { - strm.data_ptrs[strm_type] += (4 - long(strm.data_ptrs[strm_type]) % 4); + if (long(strm.data_ptrs[strm_type]) % block_align) { + strm.data_ptrs[strm_type] += + (block_align - long(strm.data_ptrs[strm_type]) % block_align); } } } } } - /*for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto col_streams = chunk_streams[col_idx]; - for (auto const& stripe : segmentation.stripes) { - for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { - auto const rg_idx = *rg_idx_it; - auto& strm = col_streams[rg_idx]; - for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { - if (long(strm.data_ptrs[strm_type]) % 4) - std::cout << strm_type << ' ' << std::hex << long(strm.data_ptrs[strm_type]) - << std::endl; - if (strm.lengths[strm_type] % 4) - std::cout << strm_type << ' ' << std::hex << strm.lengths[strm_type] << std::endl; - } - } - } - }*/ chunk_streams.host_to_device(stream); if (orc_table.num_rows() > 0) { @@ -2030,14 +2037,6 @@ __global__ void copy_string_data(char* string_pool, } } -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - CUDF_FAIL("Unsupported compression type"); -} - size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) { if (compression_kind == NONE) return 0; @@ -2147,10 +2146,17 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); + auto const block_align = block_alignment(compression_kind_); + auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); - auto enc_data = encode_columns( - orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams, stream); + auto enc_data = encode_columns(orc_table, + std::move(dictionaries), + std::move(dec_chunk_sizes), + segmentation, + streams, + block_align, + stream); // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = 
(orc_table.num_columns() + 1); @@ -2165,6 +2171,8 @@ void writer::impl::write(table_view const& table) size_t num_compressed_blocks = 0; auto const max_compressed_block_size = max_compression_output_size(compression_kind_, compression_blocksize_); + auto const padded_max_compressed_block_size = + util::round_up_unsafe(max_compressed_block_size, block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2181,7 +2189,7 @@ void writer::impl::write(table_view const& table) (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); stream_size += num_blocks * block_header_size; num_compressed_blocks += num_blocks; - compressed_bfr_size += compressed_block_size(max_compressed_block_size) * num_blocks; + compressed_bfr_size += padded_max_compressed_block_size * num_blocks; } max_stream_size = std::max(max_stream_size, stream_size); } @@ -2212,6 +2220,7 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, + block_align, strm_descs, enc_data.streams, comp_stats, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5b84b62643d..4ed7ca8c135 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -217,8 +217,6 @@ __global__ void __launch_bounds__(128) if (frag_id < num_fragments_per_column and lane_id == 0) groups[column_id][frag_id] = *g; } -constexpr size_t nvcomp_pad(size_t size) { return (size + 3) & ~3; } - // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -230,7 +228,8 @@ __global__ void __launch_bounds__(128) statistics_merge_group* chunk_grstats, int32_t num_columns, size_t max_page_size_bytes, - size_type max_page_size_rows) + size_type max_page_size_rows, + uint32_t page_align) { // TODO: All writing seems to be done by thread 0. 
Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -286,7 +285,8 @@ __global__ void __launch_bounds__(128) page_g.num_rows = ck_g.num_dict_entries; page_g.num_leaf_values = ck_g.num_dict_entries; page_g.num_values = ck_g.num_dict_entries; // TODO: shouldn't matter for dict page - page_offset += nvcomp_pad(page_g.max_hdr_size + page_g.max_data_size); + page_offset += + util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page]; } @@ -363,7 +363,7 @@ __global__ void __launch_bounds__(128) page_g.max_hdr_size += stats_hdr_len; } // pad max_hdr_size - page_g.max_hdr_size = nvcomp_pad(page_g.max_hdr_size); + page_g.max_hdr_size = util::round_up_unsafe(page_g.max_hdr_size, page_align); page_g.page_data = ck_g.uncompressed_bfr + page_offset; if (not comp_page_sizes.empty()) { page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; @@ -388,7 +388,8 @@ __global__ void __launch_bounds__(128) pagestats_g.start_chunk = ck_g.first_fragment + page_start; pagestats_g.num_chunks = page_g.num_fragments; - page_offset += nvcomp_pad(page_g.max_hdr_size + page_g.max_data_size); + page_offset += + util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page + num_pages]; } @@ -426,7 +427,7 @@ __global__ void __launch_bounds__(128) __syncwarp(); if (!t) { if (ck_g.ck_stat_size == 0 && ck_g.stats) { - uint32_t ck_stat_size = nvcomp_pad(48 + 2 * ck_max_stats_len); + uint32_t ck_stat_size = util::round_up_unsafe(48 + 2 * ck_max_stats_len, page_align); page_offset += ck_stat_size; comp_page_offset += ck_stat_size; ck_g.ck_stat_size = ck_stat_size; @@ -2044,6 +2045,7 @@ void InitEncoderPages(device_2dspan chunks, int32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + uint32_t page_align, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream) @@ -2059,7 +2061,8 @@ void InitEncoderPages(device_2dspan chunks, chunk_grstats, num_columns, max_page_size_bytes, - max_page_size_rows); + max_page_size_rows, + page_align); } void EncodePages(device_span pages, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 14e3584900a..6f83744fd09 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -515,6 +515,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] num_rowgroups Number of fragments per column * @param[in] num_columns Number of columns * @param[in] page_grstats Setup for page-level stats + * @param[in] page_align Required alignment for uncompressed pages * @param[in] chunk_grstats Setup for chunk-level stats * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages * @param[in] stream CUDA stream to use, default 0 @@ -527,6 +528,7 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, int32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + uint32_t page_align, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f87717956e4..a9c17cd5bed 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -917,6 +917,16 @@ 
auto to_nvcomp_compression_type(Compression codec) CUDF_FAIL("Unsupported compression type"); } +auto page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) { if (codec == Compression::UNCOMPRESSED) return 0; @@ -945,6 +955,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -968,6 +979,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -992,6 +1004,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -1110,6 +1123,7 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_), (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, stream); @@ -1160,7 +1174,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { case parquet::Compression::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stats, stream); } else { @@ -1168,7 +1182,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks } break; case parquet::Compression::ZSTD: - if (nvcomp_integration::is_all_enabled()) { + if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { nvcomp::batched_compress( nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stats, stream); } From a620ce9b633f0d27c939d5bbb299b206127d4fb2 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 18:49:52 -0700 Subject: [PATCH 47/66] central max chunk size; other clean up --- cpp/src/io/comp/nvcomp_adapter.cpp | 40 +++++--------------------- cpp/src/io/comp/nvcomp_adapter.hpp | 37 ++++++++++++++++++++++-- cpp/src/io/orc/orc.hpp | 12 ++++---- cpp/src/io/orc/stripe_enc.cu | 1 - cpp/src/io/orc/writer_impl.cu | 46 ++++++++++++++++-------------- cpp/src/io/parquet/writer_impl.cu | 3 +- 6 files changed, 73 insertions(+), 66 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 4c6256b427f..c3e93e796f7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -240,27 +240,12 @@ auto batched_compress_temp_size(compression_type compression, return temp_size; } -std::optional max_allowed_chunk_size(compression_type compression) +size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncompressed_chunk_bytes) { - switch (compression) { - case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP - return nvcompZstdMaxAllowedChunkSize; -#else - CUDF_FAIL("Unsupported compression type"); -#endif - case compression_type::SNAPPY: return std::nullopt; - case compression_type::DEFLATE: return std::nullopt; - default: return std::nullopt; - } -} - -size_t 
batched_compress_max_output_chunk_size(compression_type compression, - uint32_t max_uncompressed_chunk_bytes) -{ - auto const capped_uncomp_bytes = - std::min(max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), - max_uncompressed_chunk_bytes); + auto const capped_uncomp_bytes = std::min( + compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); size_t max_comp_chunk_size = 0; nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; @@ -293,16 +278,6 @@ size_t batched_compress_max_output_chunk_size(compression_type compression, return max_comp_chunk_size; } -size_t compress_input_alignment_bits(compression_type compression) -{ - switch (compression) { - case compression_type::DEFLATE: return 8; - case compression_type::SNAPPY: return 0; - case compression_type::ZSTD: return 2; - default: CUDF_FAIL("Unsupported compression type"); - } -} - // Dispatcher for nvcompBatchedCompressAsync static void batched_compress_async(compression_type compression, const void* const* device_uncompressed_ptrs, @@ -368,8 +343,7 @@ static void batched_compress_async(compression_type compression, inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept { - auto iptr = reinterpret_cast(ptr); - return !(iptr % alignment); + return (reinterpret_cast(ptr) % alignment) == 0; } void batched_compress(compression_type compression, @@ -383,7 +357,7 @@ void batched_compress(compression_type compression, auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); auto const max_uncomp_chunk_size = skip_unsupported_inputs( - nvcomp_args.input_data_sizes, statuses, max_allowed_chunk_size(compression), stream); + nvcomp_args.input_data_sizes, statuses, compress_max_allowed_chunk_size(compression), stream); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 30c8a2d4382..971de17a595 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,6 +18,7 @@ #include "gpuinflate.hpp" +#include #include #include @@ -61,8 +62,8 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @param max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch */ -[[nodiscard]] size_t batched_compress_max_output_chunk_size(compression_type compression, - uint32_t max_uncomp_chunk_size); +[[nodiscard]] size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncomp_chunk_size); /** * @brief Gets input alignment requirements for the given compression type. @@ -70,7 +71,37 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @returns required alignment, in bits */ -[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression); +[[nodiscard]] constexpr size_t compress_input_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 8; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 2; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +/** + * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP. 
+ * + * @param compression Compression type + * @returns maximum chunk size + */ +[[nodiscard]] constexpr std::optional compress_max_allowed_chunk_size( + compression_type compression) +{ + switch (compression) { + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + return nvcompZstdMaxAllowedChunkSize; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + case compression_type::SNAPPY: return std::nullopt; + case compression_type::DEFLATE: return 64 * 1024; + default: return std::nullopt; + } +} /** * @brief Device batch compression of given type. diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 858f7682b11..a007750d264 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -38,12 +38,12 @@ namespace cudf { namespace io { namespace orc { struct PostScript { - uint64_t footerLength = 0; // the length of the footer section in bytes - CompressionKind compression = NONE; // the kind of generic compression used - uint32_t compressionBlockSize = 256 * 1024; // the maximum size of each compression chunk - std::vector version; // the version of the writer [major, minor] - uint64_t metadataLength = 0; // the length of the metadata section in bytes - std::string magic = ""; // the fixed string "ORC" + uint64_t footerLength = 0; // the length of the footer section in bytes + CompressionKind compression = NONE; // the kind of generic compression used + uint32_t compressionBlockSize{}; // the maximum size of each compression chunk + std::vector version; // the version of the writer [major, minor] + uint64_t metadataLength = 0; // the length of the metadata section in bytes + std::string magic = ""; // the fixed string "ORC" }; struct StripeInformation { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 927e0a658a5..374a51ab41a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -33,7 +33,6 @@ #include #include -#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a6f9a60823a..2fd218388f2 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -85,7 +85,18 @@ template using pinned_buffer = std::unique_ptr; /** - * @brief Function that translates GDF compression to ORC compression + * @brief Translates ORC compression to nvCOMP compression + */ +auto to_nvcomp_compression_type(CompressionKind compression_kind) +{ + if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; + if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; + if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; + CUDF_FAIL("Unsupported compression type"); +} + +/** + * @brief Translates cuDF compression to ORC compression */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -95,27 +106,28 @@ orc::CompressionKind to_orc_compression(compression_type compression) case compression_type::ZLIB: return orc::CompressionKind::ZLIB; case compression_type::ZSTD: return orc::CompressionKind::ZSTD; case compression_type::NONE: return orc::CompressionKind::NONE; - default: CUDF_FAIL("Unsupported compression type"); return orc::CompressionKind::NONE; + default: CUDF_FAIL("Unsupported compression type"); } } /** * @brief Returns the block size for a given compression kind. - * - * The nvCOMP ZLIB compression is limited to blocks up to 64KiB. 
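Note: the hunk below derives the ORC compression block size from the codec's nvCOMP chunk-size limit (when nvCOMP use is enabled) instead of hard-coding 64 KiB per codec, capping the result at the previous 256 KiB default. A minimal sketch of the same derivation, with the limit passed in rather than queried from nvCOMP and a sketch-only function name:

    #include <algorithm>
    #include <cstddef>
    #include <optional>

    std::size_t block_size(std::optional<std::size_t> nvcomp_limit)
    {
      constexpr std::size_t max_block_size = 256 * 1024;
      return std::min(nvcomp_limit.value_or(max_block_size), max_block_size);
    }

    // block_size(std::nullopt) == 256 KiB (codec has no limit, or nvCOMP is disabled)
    // block_size(64 * 1024)    == 64 KiB  (e.g. a DEFLATE-style 64 KiB chunk limit)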
*/ constexpr size_t compression_block_size(orc::CompressionKind compression) { - switch (compression) { - case orc::CompressionKind::NONE: return 0; - case orc::CompressionKind::ZLIB: return 64 * 1024; - case orc::CompressionKind::ZSTD: return 64 * 1024; - default: return 256 * 1024; - } + if (compression == orc::CompressionKind::NONE) { return 0; } + + auto const ncomp_type = to_nvcomp_compression_type(compression); + auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) + ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) + : std::nullopt; + + constexpr size_t max_block_size = 256 * 1024; + return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); } /** - * @brief Function that translates GDF dtype to ORC datatype + * @brief Translates cuDF dtype to ORC datatype */ constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { @@ -522,14 +534,6 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - CUDF_FAIL("Unsupported compression type"); -} - auto block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or @@ -2041,8 +2045,8 @@ size_t max_compression_output_size(CompressionKind compression_kind, uint32_t co { if (compression_kind == NONE) return 0; - return batched_compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); + return compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), + compression_blocksize); } void writer::impl::persisted_statistics::persist(int num_table_rows, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a9c17cd5bed..0777f446d66 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -931,8 +931,7 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block { if (codec == Compression::UNCOMPRESSED) return 0; - return batched_compress_max_output_chunk_size(to_nvcomp_compression_type(codec), - compression_blocksize); + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } auto init_page_sizes(hostdevice_2dvector& chunks, From 3039afabd2fc51382f1a041ad1b699b097fa3b77 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 19:23:37 -0700 Subject: [PATCH 48/66] max page size clean up --- cpp/src/io/parquet/writer_impl.cu | 26 ++++++++++++++++---------- cpp/src/io/parquet/writer_impl.hpp | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0777f446d66..6da0cadeab6 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1242,6 +1242,18 @@ size_t writer::impl::column_index_buffer_size(gpu::EncColumnChunk* ck) const return ck->ck_stat_size * ck->num_pages + column_index_truncate_length + padding; } +size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +{ + if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + + auto const ncomp_type = to_nvcomp_compression_type(compression); + auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) + ? 
nvcomp::compress_max_allowed_chunk_size(ncomp_type) + : std::nullopt; + + return std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); +} + writer::impl::impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, @@ -1249,11 +1261,11 @@ writer::impl::impl(std::vector> sinks, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + compression_(to_parquet_compression(options.get_compression())), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_bytes(max_page_bytes(compression_, options.get_max_page_size_bytes())), max_page_size_rows(options.get_max_page_size_rows()), - compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), column_index_truncate_length(options.get_column_index_truncate_length()), @@ -1261,9 +1273,6 @@ writer::impl::impl(std::vector> sinks, single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sinks)) { - if (options.get_compression() == compression_type::ZSTD) { - max_page_size_bytes = std::min(max_page_size_bytes, 64 * 1024ul); - } if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); } @@ -1277,11 +1286,11 @@ writer::impl::impl(std::vector> sinks, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + compression_(to_parquet_compression(options.get_compression())), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_bytes(max_page_bytes(compression_, options.get_max_page_size_bytes())), max_page_size_rows(options.get_max_page_size_rows()), - compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), column_index_truncate_length(options.get_column_index_truncate_length()), @@ -1289,9 +1298,6 @@ writer::impl::impl(std::vector> sinks, single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sinks)) { - if (options.get_compression() == compression_type::ZSTD) { - max_page_size_bytes = std::min(max_page_size_bytes, 64 * 1024ul); - } if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); } diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index c6309488d6b..cac75a5dcd9 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -208,11 +208,11 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream; + Compression compression_ = Compression::UNCOMPRESSED; size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; size_t max_page_size_bytes = default_max_page_size_bytes; size_type max_page_size_rows = default_max_page_size_rows; - Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; size_type column_index_truncate_length = default_column_index_truncate_length; From a14c2a0aaa71d6ed0782f8db0b22694af5d2ebb3 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 31 Aug 2022 23:14:30 -0700 Subject: [PATCH 49/66] remove unused header --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 -- 1 file changed, 2 
deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index c3e93e796f7..64e8e058ed5 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -21,8 +21,6 @@ #include -#include - #define NVCOMP_DEFLATE_HEADER #if __has_include(NVCOMP_DEFLATE_HEADER) #include NVCOMP_DEFLATE_HEADER From c06acf97f0692e6353e048b8a5a2d1ca8334fe57 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 00:25:52 -0700 Subject: [PATCH 50/66] minor clean up --- cpp/src/io/comp/nvcomp_adapter.cpp | 4 ++-- cpp/src/io/parquet/page_enc.cu | 1 - cpp/src/io/parquet/writer_impl.cu | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 64e8e058ed5..b920c22a9ef 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -339,7 +339,7 @@ static void batched_compress_async(compression_type compression, CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression"); } -inline bool is_aligned(const void* ptr, std::uintptr_t alignment) noexcept +bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept { return (reinterpret_cast(ptr) % alignment) == 0; } @@ -359,7 +359,7 @@ void batched_compress(compression_type compression, auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); - CUDF_EXPECTS(is_aligned(scratch.data(), 8), "misaligned scratch buffer"); + CUDF_EXPECTS(is_aligned(scratch.data(), 8), "Compression failed, misaligned scratch buffer"); rmm::device_uvector actual_compressed_data_sizes(num_chunks, stream); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 4ed7ca8c135..3b9956dc3ee 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -362,7 +362,6 @@ __global__ void __launch_bounds__(128) } page_g.max_hdr_size += stats_hdr_len; } - // pad max_hdr_size page_g.max_hdr_size = util::round_up_unsafe(page_g.max_hdr_size, page_align); page_g.page_data = ck_g.uncompressed_bfr + page_offset; if (not comp_page_sizes.empty()) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6da0cadeab6..4a6c9e45dc0 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -48,7 +48,6 @@ #include #include #include -#include #include #include From e22fd6f5d1ed0145ffa30081372171d86b78a7dc Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 00:37:22 -0700 Subject: [PATCH 51/66] fix pq test --- python/cudf/cudf/tests/test_parquet.py | 33 +++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 1f72dbf173f..b258858747c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2526,20 +2526,19 @@ def test_parquet_columns_and_index_param(index, columns): assert_eq(expected, got, check_index_type=True) -def test_parquet_writer_zstd(tmpdir): - num_rows = 25000 - list_size = 7 - data = [ - struct_gen([string_gen, int_gen, string_gen], 0, list_size, False) - for i in range(num_rows) - ] - tmp = pa.Table.from_pydict({"los": data}) - fname = tmpdir.join("zstd.parquet") - pa.parquet.write_table(tmp, fname) - assert os.path.exists(fname) - expected = cudf.read_parquet(fname) - - buff = BytesIO() - expected.to_parquet(buff, compression="ZSTD") 
- got = cudf.read_parquet(buff) - assert_eq(expected, got) +def test_parquet_writer_zstd(): + size = 12345 + expected = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + try: + buff = BytesIO() + expected.to_orc(buff, compression="ZSTD") + got = cudf.read_orc(buff) + assert_eq(expected, got) + except RuntimeError: + pytest.mark.xfail(reason="Newer nvCOMP version is required") \ No newline at end of file From 5a6377aa8a47482965f76a9306c3f47117215fa0 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 00:39:11 -0700 Subject: [PATCH 52/66] style fix --- python/cudf/cudf/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b258858747c..eda4a8edecd 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2541,4 +2541,4 @@ def test_parquet_writer_zstd(): got = cudf.read_orc(buff) assert_eq(expected, got) except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") \ No newline at end of file + pytest.mark.xfail(reason="Newer nvCOMP version is required") From e8c2161bc6466528f832db326b3c2f123711b776 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 13:01:27 -0700 Subject: [PATCH 53/66] move definitions back to cpp --- cpp/src/io/comp/nvcomp_adapter.cpp | 25 +++++++++++++++++++++++++ cpp/src/io/comp/nvcomp_adapter.hpp | 26 ++------------------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index b920c22a9ef..6267d01f3c8 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -387,4 +387,29 @@ bool is_compression_enabled(compression_type compression) } } +size_t compress_input_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 2; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 2; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +std::optional compress_max_allowed_chunk_size(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 64 * 1024; + case compression_type::SNAPPY: return std::nullopt; + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + return nvcompZstdMaxAllowedChunkSize; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + default: return std::nullopt; + } +} + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 971de17a595..9e27d61538f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -71,15 +71,7 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @returns required alignment, in bits */ -[[nodiscard]] constexpr size_t compress_input_alignment_bits(compression_type compression) -{ - switch (compression) { - case compression_type::DEFLATE: return 8; - case compression_type::SNAPPY: return 0; - case compression_type::ZSTD: return 2; - default: CUDF_FAIL("Unsupported compression type"); - } -} +[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression); /** * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP. 
@@ -87,21 +79,7 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @returns maximum chunk size */ -[[nodiscard]] constexpr std::optional compress_max_allowed_chunk_size( - compression_type compression) -{ - switch (compression) { - case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP - return nvcompZstdMaxAllowedChunkSize; -#else - CUDF_FAIL("Unsupported compression type"); -#endif - case compression_type::SNAPPY: return std::nullopt; - case compression_type::DEFLATE: return 64 * 1024; - default: return std::nullopt; - } -} +[[nodiscard]] std::optional compress_max_allowed_chunk_size(compression_type compression); /** * @brief Device batch compression of given type. From 714da715500d2156267d88d171685bfd0dfd7c63 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 13:03:26 -0700 Subject: [PATCH 54/66] fix deflate alignment --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 6267d01f3c8..220434df952 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -390,7 +390,7 @@ bool is_compression_enabled(compression_type compression) size_t compress_input_alignment_bits(compression_type compression) { switch (compression) { - case compression_type::DEFLATE: return 2; + case compression_type::DEFLATE: return 3; case compression_type::SNAPPY: return 0; case compression_type::ZSTD: return 2; default: CUDF_FAIL("Unsupported compression type"); From 7c8f57192581003c3cd2056b134d67857bf5c41c Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 14:07:57 -0700 Subject: [PATCH 55/66] remove debug code --- cpp/src/io/comp/nvcomp_adapter.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index a2ccce00894..2e59d7b1acf 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -118,12 +117,6 @@ size_t skip_unsupported_inputs(device_span input_sizes, }); } - auto const skipped_num = thrust::count_if( - rmm::exec_policy(stream), statuses.begin(), statuses.end(), [] __device__(auto const& x) { - return x.status == compression_status::SKIPPED; - }); - std::cout << "Skipped " << skipped_num << " out of " << statuses.size() << std::endl; - return thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), input_sizes.end(), From c8c1f41e5b633a0ffd4f5a0042d2594a4174ca33 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 1 Sep 2022 16:40:48 -0700 Subject: [PATCH 56/66] docs --- python/cudf/cudf/utils/ioutils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index aa9b5abe812..7f597d0678a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -208,7 +208,7 @@ File path or Root Directory path. Will be used as Root Directory path while writing a partitioned dataset. Use list of str with partition_offsets to write parts of the dataframe to different files. -compression : {'snappy', None}, default 'snappy' +compression : {'snappy', 'ZSTD', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. 
If @@ -429,7 +429,7 @@ ---------- fname : str File path or object where the ORC dataset will be stored. -compression : {{ 'snappy', 'ZLIB', None }}, default None +compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None Name of the compression to use. Use None for no compression. enable_statistics: boolean, default True Enable writing column statistics. From 2a8d082b37912dd293d0338fe1efc922b15b8a1a Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 2 Sep 2022 10:41:35 -0700 Subject: [PATCH 57/66] rename to keep up with nvcomp --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 220434df952..7398c0dc626 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -404,7 +404,7 @@ std::optional compress_max_allowed_chunk_size(compression_type compressi case compression_type::SNAPPY: return std::nullopt; case compression_type::ZSTD: #if NVCOMP_HAS_ZSTD_COMP - return nvcompZstdMaxAllowedChunkSize; + return nvcompZstdCompressionMaxAllowedChunkSize; #else CUDF_FAIL("Unsupported compression type"); #endif From 0e423ec776a4cdd825fbca3cd2dbe862b4e6840e Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 2 Sep 2022 15:22:36 -0700 Subject: [PATCH 58/66] cast; fix comp_buffer size --- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 374a51ab41a..da90267506c 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1185,7 +1185,7 @@ __global__ void __launch_bounds__(256) for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * padded_comp_block_size + padded_block_header_size; + auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; statuses[ss.first_block + b] = {0, compression_status::FAILURE}; } @@ -1197,7 +1197,7 @@ __global__ void __launch_bounds__(256) * * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[out] inputs Per-block compression input buffers + * @param[in] inputs Per-block compression input buffers * @param[out] outputs Per-block compression output buffers * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2fd218388f2..71b8d51ac6a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1049,9 +1049,9 @@ encoded_data encode_columns(orc_table_view const& orc_table, strm.lengths[strm_type] = 0; strm.data_ptrs[strm_type] = nullptr; } - if (long(strm.data_ptrs[strm_type]) % block_align) { + if (reinterpret_cast(strm.data_ptrs[strm_type]) % block_align) { strm.data_ptrs[strm_type] += - (block_align - long(strm.data_ptrs[strm_type]) % block_align); + (block_align - reinterpret_cast(strm.data_ptrs[strm_type]) % block_align); } } } @@ -2177,6 +2177,8 @@ void writer::impl::write(table_view const& table) max_compression_output_size(compression_kind_, compression_blocksize_); auto const padded_max_compressed_block_size = 
util::round_up_unsafe(max_compressed_block_size, block_align); + auto const padded_block_header_size = + util::round_up_unsafe(block_header_size, block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2193,7 +2195,8 @@ void writer::impl::write(table_view const& table) (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); stream_size += num_blocks * block_header_size; num_compressed_blocks += num_blocks; - compressed_bfr_size += padded_max_compressed_block_size * num_blocks; + compressed_bfr_size += + (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; } max_stream_size = std::max(max_stream_size, stream_size); } From e2968157ed9d7f7f20e67ab81450208ef91a0b13 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 2 Sep 2022 15:23:59 -0700 Subject: [PATCH 59/66] use switch Co-authored-by: Mike Wilson --- cpp/src/io/comp/gpuinflate.cu | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 0e815f02bc5..e738b66812f 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1134,12 +1134,16 @@ __global__ void __launch_bounds__(block_size) state->err = 1; } statuses[z].bytes_written = state->out - state->outbase; - if (state->err == 1) { - statuses[z].status = compression_status::OUTPUT_OVERFLOW; - } else { - statuses[z].status = - (state->err == 0) ? compression_status::SUCCESS : compression_status::FAILURE; - } + statuses[z].status = [&]() { + switch(state->err) { + case 0: + return compression_status::SUCCESS; + case 1: + return compression_status::OUTPUT_OVERFLOW; + default: + return compression_status::FAILURE; + } + })(); statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } From 96b75dfbc94a2b3c5a3a2c29ae3cbec85aaf7228 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 2 Sep 2022 15:25:09 -0700 Subject: [PATCH 60/66] style --- cpp/src/io/comp/gpuinflate.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index e738b66812f..2742a6b8b9d 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1135,13 +1135,10 @@ __global__ void __launch_bounds__(block_size) } statuses[z].bytes_written = state->out - state->outbase; statuses[z].status = [&]() { - switch(state->err) { - case 0: - return compression_status::SUCCESS; - case 1: - return compression_status::OUTPUT_OVERFLOW; - default: - return compression_status::FAILURE; + switch (state->err) { + case 0: return compression_status::SUCCESS; + case 1: return compression_status::OUTPUT_OVERFLOW; + default: return compression_status::FAILURE; } })(); statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes From 83006f9da2e1ed73d676bc228eb06e182b463ac8 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 2 Sep 2022 15:57:48 -0700 Subject: [PATCH 61/66] separate input and output alignment --- cpp/src/io/comp/gpuinflate.cu | 4 ++-- cpp/src/io/comp/nvcomp_adapter.cpp | 12 +++++++++- cpp/src/io/comp/nvcomp_adapter.hpp | 8 +++++++ cpp/src/io/orc/orc_gpu.hpp | 4 ++-- cpp/src/io/orc/stripe_enc.cu | 12 +++++----- cpp/src/io/orc/writer_impl.cu | 35 ++++++++++++++++++++---------- 6 files changed, 52 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 2742a6b8b9d..f0f52f7881c 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ 
b/cpp/src/io/comp/gpuinflate.cu @@ -1134,13 +1134,13 @@ __global__ void __launch_bounds__(block_size) state->err = 1; } statuses[z].bytes_written = state->out - state->outbase; - statuses[z].status = [&]() { + statuses[z].status = [&]() { switch (state->err) { case 0: return compression_status::SUCCESS; case 1: return compression_status::OUTPUT_OVERFLOW; default: return compression_status::FAILURE; } - })(); + }(); statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 7398c0dc626..4d5d00eaba1 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -390,13 +390,23 @@ bool is_compression_enabled(compression_type compression) size_t compress_input_alignment_bits(compression_type compression) { switch (compression) { - case compression_type::DEFLATE: return 3; + case compression_type::DEFLATE: return 0; case compression_type::SNAPPY: return 0; case compression_type::ZSTD: return 2; default: CUDF_FAIL("Unsupported compression type"); } } +size_t compress_output_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 3; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 0; + default: CUDF_FAIL("Unsupported compression type"); + } +} + std::optional compress_max_allowed_chunk_size(compression_type compression) { switch (compression) { diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 9e27d61538f..f947fb76297 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -73,6 +73,14 @@ void batched_decompress(compression_type compression, */ [[nodiscard]] size_t compress_input_alignment_bits(compression_type compression); +/** + * @brief Gets output alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment, in bits + */ +[[nodiscard]] size_t compress_output_alignment_bits(compression_type compression); + /** * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP. 
* diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 436f680812a..177d40a4adc 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -348,7 +348,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression - * @param[in] block_align Required alignment for uncompressed blocks + * @param[in] comp_block_align Required alignment for compressed blocks * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[out] comp_stat Per-block compression status @@ -359,7 +359,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, - uint32_t block_align, + uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, device_span comp_stat, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index da90267506c..df338895a8e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1148,7 +1148,7 @@ __global__ void __launch_bounds__(1024) * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression - * @param[in] block_align Required alignment for uncompressed blocks + * @param[in] comp_block_align Required alignment for compressed blocks */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) @@ -1160,13 +1160,13 @@ __global__ void __launch_bounds__(256) uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size, - uint32_t block_align) + uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; - auto const padded_block_header_size = util::round_up_unsafe(block_header_size, block_align); - auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, block_align); + auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); + auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; @@ -1314,7 +1314,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, - uint32_t block_align, + uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, device_span comp_stat, @@ -1333,7 +1333,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, compressed_data, comp_blk_size, max_comp_blk_size, - block_align); + comp_block_align); if (compression == SNAPPY) { try { diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 71b8d51ac6a..3c64ed45976 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -534,7 +534,7 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto block_alignment(CompressionKind compression_kind) +auto uncomp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { @@ -544,6 +544,16 @@ auto block_alignment(CompressionKind compression_kind) return 1u << 
nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(compression_kind)); } +auto comp_block_alignment(CompressionKind compression_kind) +{ + if (compression_kind == NONE or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + return 1u; + } + + return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind)); +} + orc_streams writer::impl::create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes) @@ -589,7 +599,7 @@ orc_streams writer::impl::create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = block_alignment(compression_kind_) - 1; + auto const max_alignment_padding = uncomp_block_alignment(compression_kind_) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -896,7 +906,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, encoder_decimal_info&& dec_chunk_sizes, file_segmentation const& segmentation, orc_streams const& streams, - uint32_t block_align, + uint32_t uncomp_block_align, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); @@ -1049,9 +1059,10 @@ encoded_data encode_columns(orc_table_view const& orc_table, strm.lengths[strm_type] = 0; strm.data_ptrs[strm_type] = nullptr; } - if (reinterpret_cast(strm.data_ptrs[strm_type]) % block_align) { - strm.data_ptrs[strm_type] += - (block_align - reinterpret_cast(strm.data_ptrs[strm_type]) % block_align); + auto const misalignment = + reinterpret_cast(strm.data_ptrs[strm_type]) % uncomp_block_align; + if (misalignment != 0) { + strm.data_ptrs[strm_type] += (uncomp_block_align - misalignment); } } } @@ -2150,8 +2161,7 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const block_align = block_alignment(compression_kind_); - + auto const uncomp_block_align = uncomp_block_alignment(compression_kind_); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns(orc_table, @@ -2159,7 +2169,7 @@ void writer::impl::write(table_view const& table) std::move(dec_chunk_sizes), segmentation, streams, - block_align, + uncomp_block_align, stream); // Assemble individual disparate column chunks into contiguous data streams @@ -2173,12 +2183,13 @@ void writer::impl::write(table_view const& table) // Allocate intermediate output stream buffer size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; + auto const max_compressed_block_size = max_compression_output_size(compression_kind_, compression_blocksize_); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, block_align); + util::round_up_unsafe(max_compressed_block_size, uncomp_block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, block_align); + util::round_up_unsafe(block_header_size, uncomp_block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2227,7 +2238,7 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, - block_align, + comp_block_alignment(compression_kind_), strm_descs, enc_data.streams, comp_stats, From 58f84399ff7869fb21c4dcd6478b4580ff4392a0 Mon Sep 17 00:00:00 
2001 From: vuule Date: Fri, 2 Sep 2022 15:58:36 -0700 Subject: [PATCH 62/66] copyright year --- java/src/main/java/ai/rapids/cudf/CompressionType.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionType.java b/java/src/main/java/ai/rapids/cudf/CompressionType.java index d722d6d3adb..96edf1a8add 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 8bf2a1110db2d509fcd2d1a01301149e8a58f905 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 6 Sep 2022 18:27:37 -0700 Subject: [PATCH 63/66] nvcomp version fallback in is_compression_enabled --- cpp/src/io/comp/nvcomp_adapter.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 4d5d00eaba1..4af295960fb 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -380,11 +380,14 @@ void batched_compress(compression_type compression, bool is_compression_enabled(compression_type compression) { switch (compression) { - case compression_type::DEFLATE: return detail::nvcomp_integration::is_all_enabled(); + case compression_type::DEFLATE: + return NVCOMP_HAS_DEFLATE and detail::nvcomp_integration::is_all_enabled(); case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); - case compression_type::ZSTD: return detail::nvcomp_integration::is_all_enabled(); + case compression_type::ZSTD: + return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); default: return false; } + return false; } size_t compress_input_alignment_bits(compression_type compression) From 632a6a9cb666291231ddd530ca8b37a5916996b2 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 9 Sep 2022 15:44:50 -0700 Subject: [PATCH 64/66] rename status to result to match the type name --- cpp/src/io/comp/debrotli.cu | 14 ++++----- cpp/src/io/comp/gpuinflate.cu | 14 ++++----- cpp/src/io/comp/gpuinflate.hpp | 16 +++++----- cpp/src/io/comp/nvcomp_adapter.cpp | 10 +++---- cpp/src/io/comp/nvcomp_adapter.cu | 24 +++++++-------- cpp/src/io/comp/nvcomp_adapter.cuh | 6 ++-- cpp/src/io/comp/nvcomp_adapter.hpp | 8 ++--- cpp/src/io/comp/snap.cu | 12 ++++---- cpp/src/io/comp/unsnap.cu | 14 ++++----- cpp/src/io/orc/orc_gpu.hpp | 16 +++++----- cpp/src/io/orc/reader_impl.cu | 36 +++++++++++----------- cpp/src/io/orc/stripe_enc.cu | 48 +++++++++++++++--------------- cpp/src/io/orc/stripe_init.cu | 10 +++---- cpp/src/io/orc/writer_impl.cu | 20 ++++++------- cpp/src/io/parquet/page_enc.cu | 27 +++++++++-------- cpp/src/io/parquet/parquet_gpu.hpp | 8 ++--- cpp/src/io/parquet/reader_impl.cu | 31 ++++++++++--------- cpp/src/io/parquet/writer_impl.cu | 16 +++++----- 18 files changed, 164 insertions(+), 166 deletions(-) diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 8ece1fad2e0..b6f2d2db811 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1906,7 +1906,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction * * @param[in] inputs Source buffer per block * @param[out] outputs Destination buffer per block - * @param[out] statuses Decompressor status per block + * @param[out] 
results Decompressor status per block * @param scratch Intermediate device memory heap space (will be dynamically shared between blocks) * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between * blocks) @@ -1914,7 +1914,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction __global__ void __launch_bounds__(block_size, 2) gpu_debrotli_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, uint8_t* scratch, uint32_t scratch_size) { @@ -2016,11 +2016,11 @@ __global__ void __launch_bounds__(block_size, 2) __syncthreads(); // Output decompression status if (!t) { - statuses[block_id].bytes_written = s->out - s->outbase; - statuses[block_id].status = + results[block_id].bytes_written = s->out - s->outbase; + results[block_id].status = (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; // Return ext heap used by last block (statistics) - statuses[block_id].reserved = s->fb_size; + results[block_id].reserved = s->fb_size; } } @@ -2080,7 +2080,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream) @@ -2105,7 +2105,7 @@ void gpu_debrotli(device_span const> inputs, cudaMemcpyHostToDevice, stream.value())); gpu_debrotli_kernel<<>>( - inputs, outputs, statuses, scratch_u8, fb_heap_size); + inputs, outputs, results, scratch_u8, fb_heap_size); #if DUMP_FB_HEAP uint32_t dump[2]; uint32_t cur = 0; diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index f0f52f7881c..dacc5a00d16 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1020,14 +1020,14 @@ __device__ int parse_gzip_header(const uint8_t* src, size_t src_size) * @tparam block_size Thread block dimension for this call * @param inputs Source and destination buffer information per block * @param outputs Destination buffer information per block - * @param statuses Decompression status buffer per block + * @param results Decompression status buffer per block * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template __global__ void __launch_bounds__(block_size) inflate_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1133,15 +1133,15 @@ __global__ void __launch_bounds__(block_size) // Output buffer too small state->err = 1; } - statuses[z].bytes_written = state->out - state->outbase; - statuses[z].status = [&]() { + results[z].bytes_written = state->out - state->outbase; + results[z].status = [&]() { switch (state->err) { case 0: return compression_status::SUCCESS; case 1: return compression_status::OUTPUT_OVERFLOW; default: return compression_status::FAILURE; } }(); - statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes + results[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1206,14 +1206,14 @@ __global__ void __launch_bounds__(1024) void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr, rmm::cuda_stream_view stream) { constexpr int block_size = 128; // Threads per block if 
(inputs.size() > 0) { inflate_kernel - <<>>(inputs, outputs, statuses, parse_hdr); + <<>>(inputs, outputs, results, parse_hdr); } } diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index d0e4e27e678..1b45a31b13b 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -54,13 +54,13 @@ enum class gzip_header_included { NO, YES }; * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr, rmm::cuda_stream_view stream); @@ -83,12 +83,12 @@ void gpu_copy_uncompressed_blocks(device_span const> * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream); /** @@ -108,14 +108,14 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] scratch Temporary memory for intermediate work * @param[in] scratch_size Size in bytes of the temporary memory * @param[in] stream CUDA stream to use */ void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream); @@ -128,12 +128,12 @@ void gpu_debrotli(device_span const> inputs, * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream); } // namespace io diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 4af295960fb..31f7b9b472e 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -158,7 +158,7 @@ size_t batched_decompress_temp_size(compression_type compression, void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) @@ -199,7 +199,7 @@ void batched_decompress(compression_type compression, stream.value()); CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); - update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream); + update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); } // Dispatcher for nvcompBatchedCompressGetTempSize @@ -347,7 +347,7 @@ bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept void 
batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream) { auto const num_chunks = inputs.size(); @@ -355,7 +355,7 @@ void batched_compress(compression_type compression, auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); auto const max_uncomp_chunk_size = skip_unsupported_inputs( - nvcomp_args.input_data_sizes, statuses, compress_max_allowed_chunk_size(compression), stream); + nvcomp_args.input_data_sizes, results, compress_max_allowed_chunk_size(compression), stream); auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); @@ -374,7 +374,7 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); - update_compression_results(actual_compressed_data_sizes, statuses, stream); + update_compression_results(actual_compressed_data_sizes, results, stream); } bool is_compression_enabled(compression_type compression) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 2e59d7b1acf..c3c1bff9073 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -59,7 +59,7 @@ batched_args create_batched_nvcomp_args(device_span c void update_compression_results(device_span nvcomp_stats, device_span actual_output_sizes, - device_span cudf_stats, + device_span results, rmm::cuda_stream_view stream) { thrust::transform_if( @@ -67,8 +67,8 @@ void update_compression_results(device_span nvcomp_stats, nvcomp_stats.begin(), nvcomp_stats.end(), actual_output_sizes.begin(), - cudf_stats.begin(), - cudf_stats.begin(), + results.begin(), + results.begin(), [] __device__(auto const& nvcomp_status, auto const& size) { return compression_result{size, nvcomp_status == nvcompStatus_t::nvcompSuccess @@ -81,32 +81,30 @@ void update_compression_results(device_span nvcomp_stats, } void update_compression_results(device_span actual_output_sizes, - device_span cudf_stats, + device_span results, rmm::cuda_stream_view stream) { thrust::transform_if( rmm::exec_policy(stream), actual_output_sizes.begin(), actual_output_sizes.end(), - cudf_stats.begin(), - cudf_stats.begin(), + results.begin(), + results.begin(), [] __device__(auto const& size) { return compression_result{size}; }, - [] __device__(auto const& cudf_status) { - return cudf_status.status != compression_status::SKIPPED; - }); + [] __device__(auto const& results) { return results.status != compression_status::SKIPPED; }); } size_t skip_unsupported_inputs(device_span input_sizes, - device_span statuses, + device_span results, std::optional max_valid_input_size, rmm::cuda_stream_view stream) { if (max_valid_input_size.has_value()) { - auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), statuses.begin()); + auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), results.begin()); thrust::transform_if( rmm::exec_policy(stream), - statuses.begin(), - statuses.end(), + results.begin(), + results.end(), input_sizes.begin(), status_size_it, [] __device__(auto const& status) { diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index 4938a30b8b3..e49a9a6d348 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -52,14 +52,14 @@ batched_args create_batched_nvcomp_args(device_span c */ void update_compression_results(device_span nvcomp_stats, 
device_span actual_output_sizes, - device_span cudf_stats, + device_span results, rmm::cuda_stream_view stream); /** * @brief Fill the result array based on the actual output sizes. */ void update_compression_results(device_span actual_output_sizes, - device_span cudf_stats, + device_span results, rmm::cuda_stream_view stream); /** @@ -68,7 +68,7 @@ void update_compression_results(device_span actual_output_sizes, * Returns the size of the largest remaining input chunk. */ size_t skip_unsupported_inputs(device_span input_sizes, - device_span statuses, + device_span results, std::optional max_valid_input_size, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index f947fb76297..41af564ca76 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -43,7 +43,7 @@ enum class compression_type { SNAPPY, ZSTD, DEFLATE }; * @param[in] compression Compression type * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] max_uncomp_chunk_size maximum size of uncompressed chunk * @param[in] max_total_uncomp_size maximum total size of uncompressed data * @param[in] stream CUDA stream to use @@ -51,7 +51,7 @@ enum class compression_type { SNAPPY, ZSTD, DEFLATE }; void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); @@ -95,13 +95,13 @@ void batched_decompress(compression_type compression, * @param[in] compression Compression type * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 3696f8e53e4..6c7ab490751 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -260,7 +260,7 @@ static __device__ uint32_t Match60(const uint8_t* src1, __global__ void __launch_bounds__(128) snap_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses) + device_span results) { __shared__ __align__(16) snap_state_s state_g; @@ -337,22 +337,22 @@ __global__ void __launch_bounds__(128) } __syncthreads(); if (!t) { - statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; - statuses[blockIdx.x].status = + results[blockIdx.x].bytes_written = s->dst - s->dst_base; + results[blockIdx.x].status = (s->dst > s->end) ? 
compression_status::FAILURE : compression_status::SUCCESS; - statuses[blockIdx.x].reserved = 0; + results[blockIdx.x].reserved = 0; } } void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(inputs.size(), 1); if (inputs.size() > 0) { - snap_kernel<<>>(inputs, outputs, statuses); + snap_kernel<<>>(inputs, outputs, results); } } diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 8a3fe717d9e..8b13ddd1de4 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -627,7 +627,7 @@ template __global__ void __launch_bounds__(block_size) unsnap_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses) + device_span results) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -698,26 +698,26 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); } if (!t) { - statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - statuses[strm_id].status = + results[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; + results[strm_id].status = (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; if (log_cyclecount) { - statuses[strm_id].reserved = clock() - s->tstart; + results[strm_id].reserved = clock() - s->tstart; } else { - statuses[strm_id].reserved = 0; + results[strm_id].reserved = 0; } } } void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(inputs.size(), 1); // TODO: Check max grid dimensions vs max expected count - unsnap_kernel<128><<>>(inputs, outputs, statuses); + unsnap_kernel<128><<>>(inputs, outputs, results); } } // namespace io diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 177d40a4adc..c7a7a423cf2 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -56,12 +56,12 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - device_span* dec_in_ctl; // [in] input buffer to decompress - device_span* dec_out_ctl; // [in] output buffer to decompress into - device_span decstatus; // [in] results of decompression - device_span* copy_in_ctl; // [out] input buffer to copy - device_span* copy_out_ctl; // [out] output buffer to copy to + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span dec_res; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of @@ -351,7 +351,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] comp_block_align Required alignment for compressed blocks * @param[in,out] strm_desc StripeStream device array 
[stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_stat Per-block compression status + * @param[out] comp_res Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void CompressOrcDataStreams(uint8_t* compressed_data, @@ -362,7 +362,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_stat, + device_span comp_res, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 4e79a8f6399..7ff3ee85939 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -262,26 +262,26 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span results, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - if (stats[tid].status != compression_status::SUCCESS) { + if (tid < results.size()) { + if (results[tid].status != compression_status::SUCCESS) { *any_block_failure = true; // Doesn't need to be atomic } } } -void decompress_check(device_span stats, +void decompress_check(device_span results, bool* any_block_failure, rmm::cuda_stream_view stream) { - if (stats.empty()) { return; } // early exit for empty stats + if (results.empty()) { return; } // early exit for empty results dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(stats.size(), static_cast(block.x))); - decompress_check_kernel<<>>(stats, any_block_failure); + dim3 grid(cudf::util::div_rounding_up_safe(results.size(), static_cast(block.x))); + decompress_check_kernel<<>>(results, any_block_failure); } rmm::device_buffer reader::impl::decompress_stripe_data( @@ -337,10 +337,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_stats(num_compressed_blocks, stream); + rmm::device_uvector inflate_res(num_compressed_blocks, stream); thrust::fill(rmm::exec_policy(stream), - inflate_stats.begin(), - inflate_stats.end(), + inflate_res.begin(), + inflate_res.end(), compression_result{0, compression_status::FAILURE}); // Parse again to populate the decompression input/output buffers @@ -353,8 +353,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo[i].uncompressed_data = dst_base + decomp_offset; compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].decstatus = {inflate_stats.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; @@ -383,13 +383,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); } else { gpuinflate( - inflate_in_view, inflate_out_view, 
inflate_stats, gzip_header_included::NO, stream); + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); } break; case compression_type::SNAPPY: @@ -397,26 +397,26 @@ rmm::device_buffer reader::impl::decompress_stripe_data( nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); } else { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_stats, stream); + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); } break; case compression_type::ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - decompress_check(inflate_stats, any_block_failure.device_ptr(), stream); + decompress_check(inflate_res, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index df338895a8e..b1c04099e64 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1144,7 +1144,7 @@ __global__ void __launch_bounds__(1024) * @param[in] chunks EncChunk device array [rowgroup][column] * @param[out] inputs Per-block compression input buffers * @param[out] outputs Per-block compression output buffers - * @param[out] statuses Per-block compression status + * @param[out] results Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1156,7 +1156,7 @@ __global__ void __launch_bounds__(256) device_2dspan streams, // const? device_span> inputs, device_span> outputs, - device_span statuses, + device_span results, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size, @@ -1184,10 +1184,10 @@ __global__ void __launch_bounds__(256) num_blocks = (ss.stream_size > 0) ? 
(ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); - outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; - statuses[ss.first_block + b] = {0, compression_status::FAILURE}; + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); + outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; + results[ss.first_block + b] = {0, compression_status::FAILURE}; } } @@ -1199,7 +1199,7 @@ __global__ void __launch_bounds__(256) * @param[in] chunks EncChunk device array [rowgroup][column] * @param[in] inputs Per-block compression input buffers * @param[out] outputs Per-block compression output buffers - * @param[out] statuses Per-block compression status + * @param[out] results Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1209,7 +1209,7 @@ __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1235,16 +1235,16 @@ __global__ void __launch_bounds__(1024) if (t == 0) { auto const src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - auto dst_len = (statuses[ss.first_block + b].status == compression_status::SUCCESS) - ? statuses[ss.first_block + b].bytes_written + auto dst_len = (results[ss.first_block + b].status == compression_status::SUCCESS) + ? results[ss.first_block + b].bytes_written : src_len; uint32_t blk_size24{}; - if (statuses[ss.first_block + b].status == compression_status::SUCCESS) { + if (results[ss.first_block + b].status == compression_status::SUCCESS) { // Copy from uncompressed source - src = inputs[ss.first_block + b].data(); - statuses[ss.first_block + b].bytes_written = src_len; - dst_len = src_len; - blk_size24 = dst_len * 2 + 1; + src = inputs[ss.first_block + b].data(); + results[ss.first_block + b].bytes_written = src_len; + dst_len = src_len; + blk_size24 = dst_len * 2 + 1; } else { // Compressed block src = outputs[ss.first_block + b].data(); @@ -1317,7 +1317,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_stat, + device_span comp_res, rmm::cuda_stream_view stream) { rmm::device_uvector> comp_in(num_compressed_blocks, stream); @@ -1329,7 +1329,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, enc_streams, comp_in, comp_out, - comp_stat, + comp_res, compressed_data, comp_blk_size, max_comp_blk_size, @@ -1339,16 +1339,16 @@ void CompressOrcDataStreams(uint8_t* compressed_data, try { if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stat, stream); + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } else { - gpu_snap(comp_in, comp_out, comp_stat, stream); + gpu_snap(comp_in, comp_out, comp_res, stream); } } catch (...) 
{ // There was an error in compressing so set an error status for each block thrust::for_each( rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), + comp_res.begin(), + comp_res.end(), [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression @@ -1356,17 +1356,17 @@ void CompressOrcDataStreams(uint8_t* compressed_data, } else if (compression == ZLIB and nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, stream); + nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); } else if (compression == ZSTD and nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stat, stream); + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); + strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0166644c386..bd65089810e 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(128, 8) const uint8_t* cur = s->info.compressed_data; const uint8_t* end = cur + s->info.compressed_data_size; auto dec_out = s->info.dec_out_ctl; - auto dec_status = s->info.decstatus; + auto dec_result = s->info.dec_res; uint8_t* uncompressed_actual = s->info.uncompressed_data; uint8_t* uncompressed_estimated = uncompressed_actual; uint32_t num_compressed_blocks = 0; @@ -180,7 +180,7 @@ __global__ void __launch_bounds__(128, 8) if (num_compressed_blocks > max_compressed_blocks) { break; } uint32_t const dst_size = dec_out[num_compressed_blocks].size(); uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); - uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; + uint32_t const bytes_written = dec_result[num_compressed_blocks].bytes_written; uncompressed_size_actual = shuffle((lane_id == 0) ? 
bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the @@ -379,7 +379,7 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, const uint8_t* start = s->strm_info[ci_id].compressed_data; const uint8_t* cur = start; const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; - auto decstatus = s->strm_info[ci_id].decstatus.data(); + auto dec_result = s->strm_info[ci_id].dec_res.data(); uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len; @@ -396,8 +396,8 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, if (is_uncompressed) { uncomp_offset += block_len; } else { - uncomp_offset += decstatus->bytes_written; - decstatus++; + uncomp_offset += dec_result->bytes_written; + dec_result++; } } s->rowgroups[t].strm_offset[ci_id] += uncomp_offset; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3c64ed45976..a5e9e9da4cb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1385,7 +1385,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_res, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -1410,17 +1410,17 @@ void writer::impl::write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const& stream, + auto scan_record = [=, &comp_res](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type, row_group_index_info& record) { if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && (static_cast(record.pos) >= compression_blocksize_) && - (record.comp_pos + block_header_size + comp_out[record.blk_pos].bytes_written < + (record.comp_pos + block_header_size + comp_res[record.blk_pos].bytes_written < static_cast(record.comp_size))) { record.pos -= compression_blocksize_; - record.comp_pos += block_header_size + comp_out[record.blk_pos].bytes_written; + record.comp_pos += block_header_size + comp_res[record.blk_pos].bytes_written; record.blk_pos += 1; } } @@ -2226,10 +2226,10 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_stats(num_compressed_blocks, stream); + hostdevice_vector comp_results(num_compressed_blocks, stream); thrust::fill(rmm::exec_policy(stream), - comp_stats.d_begin(), - comp_stats.d_end(), + comp_results.d_begin(), + comp_results.d_end(), compression_result{0, compression_status::FAILURE}); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); @@ -2241,10 +2241,10 @@ void writer::impl::write(table_view const& table) comp_block_alignment(compression_kind_), strm_descs, enc_data.streams, - comp_stats, + comp_results, stream); strm_descs.device_to_host(stream); - comp_stats.device_to_host(stream, true); + comp_results.device_to_host(stream, true); } ProtobufWriter pbw_(&buffer_); @@ -2271,7 +2271,7 @@ void writer::impl::write(table_view const& table) segmentation, enc_data.streams, strm_descs, - comp_stats, + comp_results, intermediate_stats.rowgroup_blobs, &stripe, &streams, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3b9956dc3ee..77984ee3c27 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -870,7 +870,7 @@ 
__global__ void __launch_bounds__(128, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats) + device_span comp_results) { __shared__ __align__(8) page_enc_state_s state_g; using block_scan = cub::BlockScan; @@ -1225,9 +1225,9 @@ __global__ void __launch_bounds__(128, 8) comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // size is unused } pages[blockIdx.x] = s->page; - if (not comp_stats.empty()) { - comp_stats[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_stat = &comp_stats[blockIdx.x]; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; } } } @@ -1260,10 +1260,10 @@ __global__ void __launch_bounds__(128) gpuDecideCompression(device_spanbytes_written; - if (comp_status->status != compression_status::SUCCESS) { atomicAdd(&error_count, 1); } + compressed_data_size += comp_res->bytes_written; + if (comp_res->status != compression_status::SUCCESS) { atomicAdd(&error_count, 1); } } } uncompressed_data_size = warp_reduce(temp_storage[0]).Sum(uncompressed_data_size); @@ -1680,7 +1680,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, // blockDim(128, 1, 1) __global__ void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_results, device_span page_stats, const statistics_chunk* chunk_stats) { @@ -1709,7 +1709,7 @@ __global__ void __launch_bounds__(128) uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { hdr_start = page_g.compressed_data; - compressed_page_size = (uint32_t)comp_stat[blockIdx.x].bytes_written; + compressed_page_size = (uint32_t)comp_results[blockIdx.x].bytes_written; page_g.max_data_size = compressed_page_size; } else { hdr_start = page_g.page_data; @@ -2067,13 +2067,14 @@ void InitEncoderPages(device_2dspan chunks, void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_results, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>(pages, comp_in, comp_out, comp_stats); + gpuEncodePages<128> + <<>>(pages, comp_in, comp_out, comp_results); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2082,7 +2083,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view } void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_results, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) @@ -2090,7 +2091,7 @@ void EncodePageHeaders(device_span pages, // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs gpuEncodePageHeaders<<>>( - pages, comp_stats, page_stats, chunk_stats); + pages, comp_results, page_stats, chunk_stats); } void GatherPages(device_span chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6f83744fd09..ba8a5b0be4a 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -367,7 +367,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - compression_result* comp_stat; //!< Ptr to compression status + compression_result* comp_res; //!< Ptr to compression result }; /** @@ -539,13 +539,13 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, * @param[in,out] pages Device array of EncPages (unordered) * @param[out] comp_in Compressor input buffers * @param[out] comp_in Compressor output buffers - * @param[out] comp_stats Compressor statuses + * @param[out] comp_stats Compressor results * @param[in] stream CUDA stream to use, default 0 */ void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_res, rmm::cuda_stream_view stream); /** @@ -566,7 +566,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_res, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index c09d154e6c3..d2598c79fda 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -247,14 +247,14 @@ std::tuple conversion_info(type_id column_type_id, return std::make_tuple(type_width, clock_rate, converted_type); } -inline void decompress_check(device_span stats, +inline void decompress_check(device_span results, rmm::cuda_stream_view stream) { CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - stats.begin(), - stats.end(), - [] __device__(auto const& stat) { - return stat.status == compression_status::SUCCESS; + results.begin(), + results.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; }), "Error during decompression"); } @@ -1143,10 +1143,10 @@ rmm::device_buffer reader::impl::decompress_page_data( std::vector> comp_out; comp_out.reserve(num_comp_pages); - rmm::device_uvector comp_stats(num_comp_pages, _stream); + rmm::device_uvector comp_res(num_comp_pages, _stream); thrust::fill(rmm::exec_policy(_stream), - comp_stats.begin(), - comp_stats.end(), + comp_res.begin(), + comp_res.end(), compression_result{0, compression_status::FAILURE}); size_t decomp_offset = 0; @@ -1171,31 +1171,30 @@ rmm::device_buffer reader::impl::decompress_page_data( host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_stats_view(comp_stats.data() + start_pos, - codec.num_pages); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, _stream); + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, _stream); break; case parquet::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, - d_comp_stats_view, + d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, _stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, _stream); + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, _stream); } break; case parquet::ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, - d_comp_stats_view, + d_comp_res_view, codec.max_decompressed_size, 
codec.total_decomp_size, _stream); @@ -1203,7 +1202,7 @@ rmm::device_buffer reader::impl::decompress_page_data( case parquet::BROTLI: gpu_debrotli(d_comp_in, d_comp_out, - d_comp_stats_view, + d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), _stream); @@ -1213,7 +1212,7 @@ rmm::device_buffer reader::impl::decompress_page_data( start_pos += codec.num_pages; } - decompress_check(comp_stats, _stream); + decompress_check(comp_res, _stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e75f5faea46..2bfd7c1ba4d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1161,26 +1161,26 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); - rmm::device_uvector comp_stats(max_comp_pages, stream); + rmm::device_uvector comp_res(max_comp_pages, stream); thrust::fill(rmm::exec_policy(stream), - comp_stats.begin(), - comp_stats.end(), + comp_res.begin(), + comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); + gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stats, stream); + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } else { - gpu_snap(comp_in, comp_out, comp_stats, stream); + gpu_snap(comp_in, comp_out, comp_res, stream); } break; case parquet::Compression::ZSTD: if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { nvcomp::batched_compress( - nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_stats, stream); + nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } break; case parquet::Compression::UNCOMPRESSED: break; @@ -1191,7 +1191,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_stats, batch_pages_stats, chunk_stats, stream); + EncodePageHeaders(batch_pages, comp_res, batch_pages_stats, chunk_stats, stream); GatherPages(d_chunks_in_batch.flat_view(), pages, stream); if (column_stats != nullptr) { From 282af9da40c7cb6fd086d815482a59bbee729967 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 9 Sep 2022 16:20:19 -0700 Subject: [PATCH 65/66] address Python code review --- python/cudf/cudf/tests/test_orc.py | 8 +++++--- python/cudf/cudf/tests/test_parquet.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 39b745469ee..c2188003531 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1748,13 +1748,15 @@ def test_writer_protobuf_large_rowindexentry(): @pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) def test_orc_writer_nvcomp(list_struct_buff, compression): expected = cudf.read_orc(list_struct_buff) + + buff = BytesIO() try: - buff = BytesIO() expected.to_orc(buff, compression=compression) - got = 
cudf.read_orc(buff) - assert_eq(expected, got) except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") + else: + got = pd.read_orc(buff) + assert_eq(expected, got) @pytest.mark.parametrize("index", [True, False, None]) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 5748b767494..2b7b9a140c2 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2551,15 +2551,17 @@ def test_parquet_writer_zstd(): size = 12345 expected = cudf.DataFrame( { - "a": np.arange(0, stop=size, dtype="int64"), + "a": np.arange(0, stop=size, dtype="float64"), "b": np.random.choice(list("abcd"), size=size), "c": np.random.choice(np.arange(4), size=size), } ) + + buff = BytesIO() try: - buff = BytesIO() expected.to_orc(buff, compression="ZSTD") - got = cudf.read_orc(buff) - assert_eq(expected, got) except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") + else: + got = pd.read_orc(buff) + assert_eq(expected, got) From f766f00a6f4f43d9be1b08516227556699fe5cde Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 9 Sep 2022 18:30:42 -0700 Subject: [PATCH 66/66] style --- python/cudf/cudf/tests/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 2b7b9a140c2..11577fe5bb0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2556,12 +2556,12 @@ def test_parquet_writer_zstd(): "c": np.random.choice(np.arange(4), size=size), } ) - + buff = BytesIO() try: expected.to_orc(buff, compression="ZSTD") except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") - else: + else: got = pd.read_orc(buff) assert_eq(expected, got)
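
For reference, the user-visible effect of this series on the Python side is that both the ORC and Parquet writers accept compression="ZSTD". Below is a minimal usage sketch; it is not part of any patch above. It assumes a cudf build against a new enough nvCOMP (otherwise the writers raise RuntimeError, as the tests anticipate) and with all nvCOMP integrations enabled, per the is_all_enabled() checks in the adapter. The DataFrame contents and buffer names are illustrative only.

    # Round-trip a small frame through the new ZSTD code paths.
    from io import BytesIO

    import cudf

    df = cudf.DataFrame({"a": list(range(100)), "b": ["x", "y"] * 50})

    orc_buf = BytesIO()
    df.to_orc(orc_buf, compression="ZSTD")       # ORC writer, ZSTD-compressed blocks

    pq_buf = BytesIO()
    df.to_parquet(pq_buf, compression="ZSTD")    # Parquet writer, ZSTD-compressed pages

    # Read back with cudf to verify the round trip.
    assert cudf.read_orc(orc_buf).equals(df)
    assert cudf.read_parquet(pq_buf).equals(df)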