# Add BGZIP multibyte_split benchmark (#11723)
This refactors #11652 to extract the BGZIP I/O utilities and adds another `source_type` to the `multibyte_split` benchmark that creates a compressed input file using `zlib`.

A quick benchmark shows the BGZIP source is around 2.5x slower than reading from a device buffer, at a compression ratio of around 1:5.

### [0] Tesla T4

| source_type | delim_size | delim_percent | size_approx       | byte_range_percent | Time       | Peak Memory Usage | Encoded file size |
|-------------|------------|---------------|-------------------|--------------------|------------|-------------------|-------------------|
| bgzip       | 1          | 1             | 2^30 = 1073741824 | 100                | 507.479 ms | 4.022 GiB         | 1006.638 MiB      |
| file        | 1          | 1             | 2^30 = 1073741824 | 100                | 339.860 ms | 3.947 GiB         | 1006.638 MiB      |
| device      | 1          | 1             | 2^30 = 1073741824 | 100                | 201.556 ms | 3.947 GiB         | 1006.638 MiB      |
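For orientation, here is a minimal sketch (not part of this PR) of the path the new source type exercises. It assumes the `make_source_from_bgzip_file` factory and the `multibyte_split(source, delimiter)` overload used elsewhere in this diff; `split_bgzip_file` is a hypothetical helper name.

```cpp
// Hypothetical helper: write `text` as a BGZIP file, then split it on newlines
// through the new data chunk source.
#include <cudf/column/column.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <fstream>
#include <memory>
#include <string>

std::unique_ptr<cudf::column> split_bgzip_file(std::string const& path, std::string const& text)
{
  {
    std::ofstream output_stream(path, std::ofstream::out);
    // a single compressed block, then the empty block that marks end-of-file
    cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
                                                          {text.data(), text.size()});
    cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
  }
  auto const source = cudf::io::text::make_source_from_bgzip_file(path);
  return cudf::io::text::multibyte_split(*source, "\n");
}
```

The benchmark's `file_bgzip` case does essentially this, but chunks the input into blocks of roughly 64 KiB so each block stays below the 16-bit BGZIP block size limit.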

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)
  - Jordan Jacobelli (https://github.com/Ethyling)

URL: #11723
upsj authored Oct 10, 2022
1 parent fc5b675 commit 4eb9c6c
Showing 9 changed files with 473 additions and 185 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -152,6 +152,7 @@ outputs:
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
- test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp
- test -f $PREFIX/include/cudf/io/text/detail/trie.hpp
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -356,6 +356,7 @@ add_library(
src/io/text/byte_range_info.cpp
src/io/text/data_chunk_source_factories.cpp
src/io/text/bgzip_data_chunk_source.cu
src/io/text/bgzip_utils.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -301,7 +301,8 @@ ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)

add_custom_target(
run_benchmarks
70 changes: 57 additions & 13 deletions cpp/benchmarks/io/text/multibyte_split.cpp
@@ -25,6 +25,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/combine.hpp>
@@ -40,10 +41,25 @@
#include <cstdio>
#include <fstream>
#include <memory>
#include <random>

temp_directory const temp_dir("cudf_nvbench");

enum class data_chunk_source_type { device, file, host, host_pinned };
enum class data_chunk_source_type { device, file, host, host_pinned, file_bgzip };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
data_chunk_source_type,
[](auto value) {
switch (value) {
case data_chunk_source_type::device: return "device";
case data_chunk_source_type::file: return "file";
case data_chunk_source_type::host: return "host";
case data_chunk_source_type::host_pinned: return "host_pinned";
case data_chunk_source_type::file_bgzip: return "file_bgzip";
default: return "Unknown";
}
},
[](auto) { return std::string{}; })

static cudf::string_scalar create_random_input(int32_t num_chars,
double delim_factor,
Expand Down Expand Up @@ -78,14 +94,32 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
return cudf::string_scalar(std::move(*chars_buffer));
}

static void bench_multibyte_split(nvbench::state& state)
static void write_bgzip_file(cudf::host_span<char const> host_data, std::ostream& output_stream)
{
// a bit of variability with a decent amount of padding so we don't overflow 16 bit block sizes
std::uniform_int_distribution<std::size_t> chunk_size_dist{64000, 65000};
std::default_random_engine rng{};
std::size_t pos = 0;
while (pos < host_data.size()) {
auto const remainder = host_data.size() - pos;
auto const chunk_size = std::min(remainder, chunk_size_dist(rng));
cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
{host_data.data() + pos, chunk_size});
pos += chunk_size;
}
// empty block denotes EOF
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

template <data_chunk_source_type source_type>
static void bench_multibyte_split(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<source_type>>)
{
cudf::rmm_pool_raii pool_raii;

auto const source_type = static_cast<data_chunk_source_type>(state.get_int64("source_type"));
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const byte_range_percent = state.get_int64("byte_range_percent");

auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
@@ -104,7 +138,8 @@ static void bench_multibyte_split(nvbench::state& state)
auto host_pinned_input =
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>{};

if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file) {
if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file ||
source_type == data_chunk_source_type::file_bgzip) {
host_input = cudf::detail::make_std_vector_sync<char>(
{device_input.data(), static_cast<std::size_t>(device_input.size())},
cudf::default_stream_value);
@@ -131,6 +166,14 @@
return cudf::io::text::make_source(host_pinned_input);
case data_chunk_source_type::device: //
return cudf::io::text::make_source(device_input);
case data_chunk_source_type::file_bgzip: {
auto const temp_file_name = random_file_in_dir(temp_dir.path());
{
std::ofstream output_stream(temp_file_name, std::ofstream::out);
write_bgzip_file(host_input, output_stream);
}
return cudf::io::text::make_source_from_bgzip_file(temp_file_name);
}
default: CUDF_FAIL();
}
}();
@@ -152,13 +195,14 @@ static void bench_multibyte_split(nvbench::state& state)
state.add_buffer_size(range_size, "efs", "Encoded file size");
}

NVBENCH_BENCH(bench_multibyte_split)
using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
data_chunk_source_type::file,
data_chunk_source_type::host,
data_chunk_source_type::host_pinned,
data_chunk_source_type::file_bgzip>;

NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split")
.add_int64_axis("source_type",
{static_cast<int>(data_chunk_source_type::device),
static_cast<int>(data_chunk_source_type::file),
static_cast<int>(data_chunk_source_type::host),
static_cast<int>(data_chunk_source_type::host_pinned)})
.add_int64_axis("delim_size", {1, 4, 7})
.add_int64_axis("delim_percent", {1, 25})
.add_int64_power_of_two_axis("size_approx", {15, 30})
112 changes: 112 additions & 0 deletions cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -0,0 +1,112 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <zlib.h>

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <algorithm>
#include <array>
#include <fstream>
#include <limits>

namespace cudf::io::text::detail::bgzip {

struct header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

struct footer {
uint32_t crc;
uint32_t decompressed_size;
};

/**
* @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position
* is at the first data byte.
*
* @param input_stream The input stream
* @return The header storing the compressed size and extra subfield length
*/
header read_header(std::istream& input_stream);

/**
* @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position
* is after the last footer byte.
*
* @param input_stream The input stream
* @return The footer storing uncompressed size and CRC32
*/
footer read_footer(std::istream& input_stream);

/**
* @brief Writes a header for data of the given compressed size to the given stream.
*
* @param output_stream The output stream
* @param compressed_size The size of the compressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_header(std::ostream& output_stream,
uint16_t compressed_size,
host_span<char const> pre_size_subfields,
host_span<char const> post_size_subfields);

/**
* @brief Writes a footer for the given uncompressed data to the given stream.
*
* @param output_stream The output stream
* @param data The data for which uncompressed size and CRC32 will be computed and written
*/
void write_footer(std::ostream& output_stream, host_span<char const> data);

/**
* @brief Writes the given data to the given stream as an uncompressed deflate block with BGZIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_uncompressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> pre_size_subfields = {},
host_span<char const> post_size_subfields = {});

/**
* @brief Writes the given data to the given stream as a compressed deflate block with BGZIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_compressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> pre_size_subfields = {},
host_span<char const> post_size_subfields = {});

} // namespace cudf::io::text::detail::bgzip
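Below is a minimal usage sketch (not part of the commit) of the interface declared above. It relies only on the documented stream-position behavior of `read_header`/`read_footer`, and uses `std::stringstream` in place of a file stream.

```cpp
#include <cudf/io/text/detail/bgzip_utils.hpp>

#include <cassert>
#include <ios>
#include <sstream>
#include <string>

int main()
{
  namespace bgzip = cudf::io::text::detail::bgzip;
  std::string const payload = "chr1\t100\nchr1\t200\n";
  std::stringstream stream;

  // one compressed block followed by the empty block that terminates a BGZIP file
  bgzip::write_compressed_block(stream, {payload.data(), payload.size()});
  bgzip::write_uncompressed_block(stream, {});

  // read_header leaves the stream at the first data byte, so skipping
  // header.data_size() bytes lands exactly on the footer
  auto const header = bgzip::read_header(stream);
  stream.seekg(header.data_size(), std::ios_base::cur);
  auto const footer = bgzip::read_footer(stream);
  assert(footer.decompressed_size == payload.size());
  return 0;
}
```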
72 changes: 5 additions & 67 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -20,6 +20,7 @@

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

@@ -36,7 +37,6 @@
#include <limits>

namespace cudf::io::text {

namespace {

/**
@@ -64,68 +64,6 @@ struct bgzip_nvcomp_transform_functor {

class bgzip_data_chunk_reader : public data_chunk_reader {
private:
template <typename IntType>
static IntType read_int(char* data)
{
IntType result{};
// we assume little-endian
std::memcpy(&result, &data[0], sizeof(result));
return result;
}

struct bgzip_header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

bgzip_header read_header()
{
std::array<char, 12> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
std::array<uint8_t, 4> const expected_header{{31, 139, 8, 4}};
CUDF_EXPECTS(
std::equal(
expected_header.begin(), expected_header.end(), reinterpret_cast<uint8_t*>(buffer.data())),
"malformed BGZIP header");
// we ignore the remaining bytes of the fixed header, since they don't matter to us
auto const extra_length = read_int<uint16_t>(&buffer[10]);
uint16_t extra_offset{};
// read all the extra subfields
while (extra_offset < extra_length) {
auto const remaining_size = extra_length - extra_offset;
CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length");
// a subfield consists of 2 identifier bytes and a uint16 length
// 66/67 identifies a BGZIP block size field, we skip all other fields
_data_stream->read(buffer.data(), 4);
extra_offset += 4;
auto const subfield_size = read_int<uint16_t>(&buffer[2]);
if (buffer[0] == 66 && buffer[1] == 67) {
// the block size subfield contains a single uint16 value, which is block_size - 1
CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield");
_data_stream->read(buffer.data(), sizeof(uint16_t));
_data_stream->seekg(remaining_size - 6, std::ios_base::cur);
auto const block_size_minus_one = read_int<uint16_t>(&buffer[0]);
return {block_size_minus_one + 1, extra_length};
} else {
_data_stream->seekg(subfield_size, std::ios_base::cur);
extra_offset += subfield_size;
}
}
CUDF_FAIL("missing BGZIP size extra subfield");
}

struct bgzip_footer {
uint32_t decompressed_size;
};

bgzip_footer read_footer()
{
std::array<char, 8> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
return {read_int<uint32_t>(&buffer[4])};
}

template <typename T>
using pinned_host_vector =
thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
@@ -258,13 +196,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
return available_decompressed_size - read_pos;
}

void read_block(bgzip_header header, std::istream& stream)
void read_block(detail::bgzip::header header, std::istream& stream)
{
h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size());
stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size());
}

void add_block_offsets(bgzip_header header, bgzip_footer footer)
void add_block_offsets(detail::bgzip::header header, detail::bgzip::footer footer)
{
max_decompressed_size =
std::max<std::size_t>(footer.decompressed_size, max_decompressed_size);
@@ -294,9 +232,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
// peek is necessary if we are already at the end, but didn't try to read another byte
_data_stream->peek();
if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; }
auto header = read_header();
auto header = detail::bgzip::read_header(*_data_stream);
_curr_blocks.read_block(header, *_data_stream);
auto footer = read_footer();
auto footer = detail::bgzip::read_footer(*_data_stream);
_curr_blocks.add_block_offsets(header, footer);
// for the last GZIP block, we restrict ourselves to the bytes up to _local_end
// but only for the reader, not for decompression!
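For context, here is a hedged sketch (not in the diff) of the block-walking pattern that the reader above now delegates to the shared `detail::bgzip` helpers: skip each compressed payload and accumulate the decompressed sizes reported by the footers. `total_decompressed_size` is a hypothetical illustration, not a cudf function.

```cpp
#include <cudf/io/text/detail/bgzip_utils.hpp>

#include <cstddef>
#include <ios>
#include <istream>

std::size_t total_decompressed_size(std::istream& stream)
{
  namespace bgzip = cudf::io::text::detail::bgzip;
  std::size_t total = 0;
  while (true) {
    // peek forces EOF detection before we attempt to read another header
    stream.peek();
    if (stream.eof()) { break; }
    auto const header = bgzip::read_header(stream);        // stream now at first data byte
    stream.seekg(header.data_size(), std::ios_base::cur);  // skip the compressed payload
    auto const footer = bgzip::read_footer(stream);        // stream now past the footer
    total += footer.decompressed_size;
  }
  return total;
}
```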
