Skip to content

Commit

Permalink
Fix local offset handling in bgzip reader (#11918)
Browse files Browse the repository at this point in the history
We accidentally checked the local offset against the compressed, not the uncompressed size. The new test failed prior to fixing the behavior.

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #11918
  • Loading branch information
upsj authored Oct 14, 2022
1 parent e91d7d9 commit 8a31e26
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 29 deletions.
4 changes: 2 additions & 2 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
// seek to the beginning of the provided local offset
auto const local_pos = virtual_begin & 0xFFFFu;
if (local_pos > 0) {
CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 &&
local_pos < _curr_blocks.h_compressed_offsets[1],
CUDF_EXPECTS(_curr_blocks.h_decompressed_offsets.size() > 1 &&
local_pos < _curr_blocks.h_decompressed_offsets[1],
"local part of virtual offset is out of bounds");
_curr_blocks.consume_bytes(local_pos);
}
Expand Down
72 changes: 45 additions & 27 deletions cpp/tests/io/text/data_chunk_source_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ enum class compression { ENABLED, DISABLED };

enum class eof { ADD_EOF_BLOCK, NO_EOF_BLOCK };

uint64_t virtual_offset(std::size_t block_offset, std::size_t local_offset)
{
return (block_offset << 16) | local_offset;
}

void write_bgzip(std::ostream& output_stream,
cudf::host_span<const char> data,
std::default_random_engine& rng,
Expand Down Expand Up @@ -193,6 +198,7 @@ TEST_F(DataChunkSourceTest, BgzipSource)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
Expand All @@ -211,13 +217,11 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
std::string padding_garbage{"garbage"};
for (int i = 0; i < 10; i++) {
padding_garbage = padding_garbage + padding_garbage;
}
std::string const padding_garbage(10000, 'g');
std::string const data_garbage{"GARBAGE"};
std::string const begininput{"begin of bananarama"};
std::string const endinput{"end of bananarama"};
Expand All @@ -241,10 +245,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets)
}
input = begininput + input + endinput;

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(begin_compressed_offset, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));

test_source(input, *source);
}
Expand All @@ -255,8 +259,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock)
std::string const input{"collection unit brings"};
std::string const head_garbage{"garbage"};
std::string const tail_garbage{"GARBAGE"};
std::size_t begin_compressed_offset{};
std::size_t end_compressed_offset{};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{head_garbage.size() + input.size()};
{
Expand All @@ -266,10 +268,8 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock)
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset));

test_source(input, *source);
}
Expand All @@ -280,7 +280,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk)
std::string const input{"collection unit brings"};
std::string const head_garbage{"garbage"};
std::string const tail_garbage{"GARBAGE"};
std::size_t begin_compressed_offset{};
std::size_t end_compressed_offset{};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{input.size() - 10};
Expand All @@ -294,10 +293,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk)
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(0, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));

test_source(input, *source);
}
Expand All @@ -306,13 +305,11 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_compressed_offsets");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
std::string padding_garbage{"garbage"};
for (int i = 0; i < 10; i++) {
padding_garbage = padding_garbage + padding_garbage;
}
std::string const padding_garbage(10000, 'g');
std::string const data_garbage{"GARBAGE"};
std::string const begininput{"begin of bananarama"};
std::string const endinput{"end of bananarama"};
Expand All @@ -335,10 +332,31 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets)
}
input = begininput + input + endinput;

auto source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(begin_compressed_offset, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));
test_source(input, *source);
}

TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleCompressedGZipBlock)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets_single_compressed_block");
std::string const input{"collection unit brings"};
std::string const head_garbage(10000, 'g');
std::string const tail_garbage{"GARBAGE"};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{head_garbage.size() + input.size()};
{
std::ofstream output_stream{filename};
cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
head_garbage + input + tail_garbage);
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source = cudf::io::text::make_source_from_bgzip_file(
filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset));

test_source(input, *source);
}

Expand Down

0 comments on commit 8a31e26

Please sign in to comment.