From e164ffcda4e582dd5b4cd1ce3973d5782ab14d05 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 14 Oct 2022 03:33:36 +0000 Subject: [PATCH 1/3] fix local offset handling --- cpp/src/io/text/bgzip_data_chunk_source.cu | 4 +-- cpp/tests/io/text/data_chunk_source_test.cpp | 29 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 9c4ff218783..e4b6bad614d 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -271,8 +271,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // seek to the beginning of the provided local offset auto const local_pos = virtual_begin & 0xFFFFu; if (local_pos > 0) { - CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 && - local_pos < _curr_blocks.h_compressed_offsets[1], + CUDF_EXPECTS(_curr_blocks.h_decompressed_offsets.size() > 1 && + local_pos < _curr_blocks.h_decompressed_offsets[1], "local part of virtual offset is out of bounds"); _curr_blocks.consume_bytes(local_pos); } diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 7cb75aea8e2..64b8c40aa31 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -342,4 +342,33 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets) test_source(input, *source); } +TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleCompressedGZipBlock) +{ + auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets_single_compressed_block"); + std::string const input{"collection unit brings"}; + std::string head_garbage{"garbage"}; + // make the head_garbage large enough so the local offset will overrun the compressed block size + for (int i = 0; i < 10; i++) { + head_garbage += head_garbage; + } + std::string const tail_garbage{"GARBAGE"}; + std::size_t begin_compressed_offset{}; + std::size_t end_compressed_offset{}; + std::size_t const begin_local_offset{head_garbage.size()}; + std::size_t const end_local_offset{head_garbage.size() + input.size()}; + { + std::ofstream output_stream{filename}; + cudf::io::text::detail::bgzip::write_compressed_block(output_stream, + head_garbage + input + tail_garbage); + cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); + } + + auto const source = + cudf::io::text::make_source_from_bgzip_file(filename, + begin_compressed_offset << 16 | begin_local_offset, + end_compressed_offset << 16 | end_local_offset); + + test_source(input, *source); +} + CUDF_TEST_PROGRAM_MAIN() From f0b6f5c546f271f5357c0b85ff609c1666d30b4b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 14 Oct 2022 13:17:57 +0000 Subject: [PATCH 2/3] more const, extract virtual offset computation --- cpp/tests/io/text/data_chunk_source_test.cpp | 52 +++++++++----------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 64b8c40aa31..8491a9d6473 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -130,6 +130,11 @@ enum class compression { ENABLED, DISABLED }; enum class eof { ADD_EOF_BLOCK, NO_EOF_BLOCK }; +uint64_t virtual_offset(std::size_t block_offset, std::size_t local_offset) +{ + return (block_offset << 16) | local_offset; +} + void write_bgzip(std::ostream& output_stream, cudf::host_span data, std::default_random_engine& rng, @@ -241,10 +246,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets) } input = begininput + input + endinput; - auto const source = - cudf::io::text::make_source_from_bgzip_file(filename, - begin_compressed_offset << 16 | begin_local_offset, - end_compressed_offset << 16 | end_local_offset); + auto const source = cudf::io::text::make_source_from_bgzip_file( + filename, + virtual_offset(begin_compressed_offset, begin_local_offset), + virtual_offset(end_compressed_offset, end_local_offset)); test_source(input, *source); } @@ -255,8 +260,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock) std::string const input{"collection unit brings"}; std::string const head_garbage{"garbage"}; std::string const tail_garbage{"GARBAGE"}; - std::size_t begin_compressed_offset{}; - std::size_t end_compressed_offset{}; std::size_t const begin_local_offset{head_garbage.size()}; std::size_t const end_local_offset{head_garbage.size() + input.size()}; { @@ -266,10 +269,8 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock) cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); } - auto const source = - cudf::io::text::make_source_from_bgzip_file(filename, - begin_compressed_offset << 16 | begin_local_offset, - end_compressed_offset << 16 | end_local_offset); + auto const source = cudf::io::text::make_source_from_bgzip_file( + filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset)); test_source(input, *source); } @@ -280,7 +281,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk) std::string const input{"collection unit brings"}; std::string const head_garbage{"garbage"}; std::string const tail_garbage{"GARBAGE"}; - std::size_t begin_compressed_offset{}; std::size_t end_compressed_offset{}; std::size_t const begin_local_offset{head_garbage.size()}; std::size_t const end_local_offset{input.size() - 10}; @@ -294,10 +294,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk) cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); } - auto const source = - cudf::io::text::make_source_from_bgzip_file(filename, - begin_compressed_offset << 16 | begin_local_offset, - end_compressed_offset << 16 | end_local_offset); + auto const source = cudf::io::text::make_source_from_bgzip_file( + filename, + virtual_offset(0, begin_local_offset), + virtual_offset(end_compressed_offset, end_local_offset)); test_source(input, *source); } @@ -335,10 +335,10 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets) } input = begininput + input + endinput; - auto source = - cudf::io::text::make_source_from_bgzip_file(filename, - begin_compressed_offset << 16 | begin_local_offset, - end_compressed_offset << 16 | end_local_offset); + auto source = cudf::io::text::make_source_from_bgzip_file( + filename, + virtual_offset(begin_compressed_offset, begin_local_offset), + virtual_offset(end_compressed_offset, end_local_offset)); test_source(input, *source); } @@ -346,14 +346,8 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleCompressedGZipBlock) { auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets_single_compressed_block"); std::string const input{"collection unit brings"}; - std::string head_garbage{"garbage"}; - // make the head_garbage large enough so the local offset will overrun the compressed block size - for (int i = 0; i < 10; i++) { - head_garbage += head_garbage; - } + std::string const head_garbage(10000, 'g'); std::string const tail_garbage{"GARBAGE"}; - std::size_t begin_compressed_offset{}; - std::size_t end_compressed_offset{}; std::size_t const begin_local_offset{head_garbage.size()}; std::size_t const end_local_offset{head_garbage.size() + input.size()}; { @@ -363,10 +357,8 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleCompressedGZipBlock) cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); } - auto const source = - cudf::io::text::make_source_from_bgzip_file(filename, - begin_compressed_offset << 16 | begin_local_offset, - end_compressed_offset << 16 | end_local_offset); + auto const source = cudf::io::text::make_source_from_bgzip_file( + filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset)); test_source(input, *source); } From f2019c914310fa17e4bb84317e56406aa4a1c544 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 14 Oct 2022 13:36:28 +0000 Subject: [PATCH 3/3] preallocate storage --- cpp/tests/io/text/data_chunk_source_test.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 8491a9d6473..2111d66a066 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -198,6 +198,7 @@ TEST_F(DataChunkSourceTest, BgzipSource) { auto const filename = temp_env->get_temp_filepath("bgzip_source"); std::string input{"bananarama"}; + input.reserve(input.size() << 25); for (int i = 0; i < 24; i++) { input = input + input; } @@ -216,13 +217,11 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets) { auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets"); std::string input{"bananarama"}; + input.reserve(input.size() << 25); for (int i = 0; i < 24; i++) { input = input + input; } - std::string padding_garbage{"garbage"}; - for (int i = 0; i < 10; i++) { - padding_garbage = padding_garbage + padding_garbage; - } + std::string const padding_garbage(10000, 'g'); std::string const data_garbage{"GARBAGE"}; std::string const begininput{"begin of bananarama"}; std::string const endinput{"end of bananarama"}; @@ -306,13 +305,11 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets) { auto const filename = temp_env->get_temp_filepath("bgzip_source_compressed_offsets"); std::string input{"bananarama"}; + input.reserve(input.size() << 25); for (int i = 0; i < 24; i++) { input = input + input; } - std::string padding_garbage{"garbage"}; - for (int i = 0; i < 10; i++) { - padding_garbage = padding_garbage + padding_garbage; - } + std::string const padding_garbage(10000, 'g'); std::string const data_garbage{"GARBAGE"}; std::string const begininput{"begin of bananarama"}; std::string const endinput{"end of bananarama"};