Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix local offset handling in bgzip reader #11918

Merged
merged 3 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
// seek to the beginning of the provided local offset
auto const local_pos = virtual_begin & 0xFFFFu;
if (local_pos > 0) {
CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 &&
local_pos < _curr_blocks.h_compressed_offsets[1],
CUDF_EXPECTS(_curr_blocks.h_decompressed_offsets.size() > 1 &&
local_pos < _curr_blocks.h_decompressed_offsets[1],
"local part of virtual offset is out of bounds");
_curr_blocks.consume_bytes(local_pos);
}
Expand Down
72 changes: 45 additions & 27 deletions cpp/tests/io/text/data_chunk_source_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ enum class compression { ENABLED, DISABLED };

enum class eof { ADD_EOF_BLOCK, NO_EOF_BLOCK };

uint64_t virtual_offset(std::size_t block_offset, std::size_t local_offset)
{
return (block_offset << 16) | local_offset;
}

void write_bgzip(std::ostream& output_stream,
cudf::host_span<const char> data,
std::default_random_engine& rng,
Expand Down Expand Up @@ -193,6 +198,7 @@ TEST_F(DataChunkSourceTest, BgzipSource)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
Expand All @@ -211,13 +217,11 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
std::string padding_garbage{"garbage"};
for (int i = 0; i < 10; i++) {
padding_garbage = padding_garbage + padding_garbage;
}
std::string const padding_garbage(10000, 'g');
std::string const data_garbage{"GARBAGE"};
std::string const begininput{"begin of bananarama"};
std::string const endinput{"end of bananarama"};
Expand All @@ -241,10 +245,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsets)
}
input = begininput + input + endinput;

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(begin_compressed_offset, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));

test_source(input, *source);
}
Expand All @@ -255,8 +259,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock)
std::string const input{"collection unit brings"};
std::string const head_garbage{"garbage"};
std::string const tail_garbage{"GARBAGE"};
std::size_t begin_compressed_offset{};
std::size_t end_compressed_offset{};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{head_garbage.size() + input.size()};
{
Expand All @@ -266,10 +268,8 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleGZipBlock)
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset));

test_source(input, *source);
}
Expand All @@ -280,7 +280,6 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk)
std::string const input{"collection unit brings"};
std::string const head_garbage{"garbage"};
std::string const tail_garbage{"GARBAGE"};
std::size_t begin_compressed_offset{};
std::size_t end_compressed_offset{};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{input.size() - 10};
Expand All @@ -294,10 +293,10 @@ TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleChunk)
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto const source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(0, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));

test_source(input, *source);
}
Expand All @@ -306,13 +305,11 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_compressed_offsets");
std::string input{"bananarama"};
input.reserve(input.size() << 25);
for (int i = 0; i < 24; i++) {
input = input + input;
}
std::string padding_garbage{"garbage"};
for (int i = 0; i < 10; i++) {
padding_garbage = padding_garbage + padding_garbage;
}
std::string const padding_garbage(10000, 'g');
std::string const data_garbage{"GARBAGE"};
std::string const begininput{"begin of bananarama"};
std::string const endinput{"end of bananarama"};
Expand All @@ -335,10 +332,31 @@ TEST_F(DataChunkSourceTest, BgzipCompressedSourceVirtualOffsets)
}
input = begininput + input + endinput;

auto source =
cudf::io::text::make_source_from_bgzip_file(filename,
begin_compressed_offset << 16 | begin_local_offset,
end_compressed_offset << 16 | end_local_offset);
auto source = cudf::io::text::make_source_from_bgzip_file(
filename,
virtual_offset(begin_compressed_offset, begin_local_offset),
virtual_offset(end_compressed_offset, end_local_offset));
test_source(input, *source);
}

TEST_F(DataChunkSourceTest, BgzipSourceVirtualOffsetsSingleCompressedGZipBlock)
{
auto const filename = temp_env->get_temp_filepath("bgzip_source_offsets_single_compressed_block");
std::string const input{"collection unit brings"};
std::string const head_garbage(10000, 'g');
std::string const tail_garbage{"GARBAGE"};
std::size_t const begin_local_offset{head_garbage.size()};
std::size_t const end_local_offset{head_garbage.size() + input.size()};
{
std::ofstream output_stream{filename};
cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
head_garbage + input + tail_garbage);
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

auto const source = cudf::io::text::make_source_from_bgzip_file(
filename, virtual_offset(0, begin_local_offset), virtual_offset(0, end_local_offset));

test_source(input, *source);
}

Expand Down