Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BGZIP multibyte_split benchmark #11723

Merged
merged 7 commits into from
Oct 10, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ outputs:
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
- test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp
- test -f $PREFIX/include/cudf/io/text/detail/trie.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ add_library(
src/io/text/byte_range_info.cpp
src/io/text/data_chunk_source_factories.cpp
src/io/text/bgzip_data_chunk_source.cu
src/io/text/bgzip_utils.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp)
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)

add_custom_target(
run_benchmarks
Expand Down
70 changes: 57 additions & 13 deletions cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/combine.hpp>
Expand All @@ -40,10 +41,25 @@
#include <cstdio>
#include <fstream>
#include <memory>
#include <random>

temp_directory const temp_dir("cudf_nvbench");

enum class data_chunk_source_type { device, file, host, host_pinned };
enum class data_chunk_source_type { device, file, host, host_pinned, file_bgzip };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
data_chunk_source_type,
[](auto value) {
switch (value) {
case data_chunk_source_type::device: return "device";
case data_chunk_source_type::file: return "file";
case data_chunk_source_type::host: return "host";
case data_chunk_source_type::host_pinned: return "host_pinned";
case data_chunk_source_type::file_bgzip: return "file_bgzip";
default: return "Unknown";
}
},
[](auto) { return std::string{}; })

static cudf::string_scalar create_random_input(int32_t num_chars,
double delim_factor,
Expand Down Expand Up @@ -78,14 +94,32 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
return cudf::string_scalar(std::move(*chars_buffer));
}

static void bench_multibyte_split(nvbench::state& state)
static void write_bgzip_file(cudf::host_span<char const> host_data, std::ostream& output_stream)
{
// a bit of variability with a decent amount of padding so we don't overflow 16 bit block sizes
std::uniform_int_distribution<std::size_t> chunk_size_dist{64000, 65000};
std::default_random_engine rng{};
std::size_t pos = 0;
while (pos < host_data.size()) {
auto const remainder = host_data.size() - pos;
auto const chunk_size = std::min(remainder, chunk_size_dist(rng));
cudf::io::text::detail::bgzip::write_compressed_block(output_stream,
{host_data.data() + pos, chunk_size});
pos += chunk_size;
}
// empty block denotes EOF
cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {});
}

template <data_chunk_source_type source_type>
static void bench_multibyte_split(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<source_type>>)
{
cudf::rmm_pool_raii pool_raii;

auto const source_type = static_cast<data_chunk_source_type>(state.get_int64("source_type"));
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const delim_size = state.get_int64("delim_size");
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const byte_range_percent = state.get_int64("byte_range_percent");

auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
Expand All @@ -104,7 +138,8 @@ static void bench_multibyte_split(nvbench::state& state)
auto host_pinned_input =
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>{};

if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file) {
if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file ||
source_type == data_chunk_source_type::file_bgzip) {
host_input = cudf::detail::make_std_vector_sync<char>(
{device_input.data(), static_cast<std::size_t>(device_input.size())},
cudf::default_stream_value);
Expand All @@ -131,6 +166,14 @@ static void bench_multibyte_split(nvbench::state& state)
return cudf::io::text::make_source(host_pinned_input);
case data_chunk_source_type::device: //
return cudf::io::text::make_source(device_input);
case data_chunk_source_type::file_bgzip: {
auto const temp_file_name = random_file_in_dir(temp_dir.path());
{
std::ofstream output_stream(temp_file_name, std::ofstream::out);
write_bgzip_file(host_input, output_stream);
}
return cudf::io::text::make_source_from_bgzip_file(temp_file_name);
}
default: CUDF_FAIL();
}
}();
Expand All @@ -152,13 +195,14 @@ static void bench_multibyte_split(nvbench::state& state)
state.add_buffer_size(range_size, "efs", "Encoded file size");
}

NVBENCH_BENCH(bench_multibyte_split)
using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,
data_chunk_source_type::file,
data_chunk_source_type::host,
data_chunk_source_type::host_pinned,
data_chunk_source_type::file_bgzip>;

NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split")
.add_int64_axis("source_type",
{static_cast<int>(data_chunk_source_type::device),
static_cast<int>(data_chunk_source_type::file),
static_cast<int>(data_chunk_source_type::host),
static_cast<int>(data_chunk_source_type::host_pinned)})
.add_int64_axis("delim_size", {1, 4, 7})
.add_int64_axis("delim_percent", {1, 25})
.add_int64_power_of_two_axis("size_approx", {15, 30})
Expand Down
112 changes: 112 additions & 0 deletions cpp/include/cudf/io/text/detail/bgzip_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <algorithm>
#include <array>
#include <fstream>
#include <limits>

#include <zlib.h>
upsj marked this conversation as resolved.
Show resolved Hide resolved

namespace cudf::io::text::detail::bgzip {

struct header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

struct footer {
uint32_t crc;
uint32_t decompressed_size;
};

/**
* @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position
* is at the first data byte.
*
* @param input_stream The input stream
* @return The header storing the compressed size and extra subfield length
*/
header read_header(std::istream& input_stream);

/**
* @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position
* is after the last footer byte.
*
* @param input_stream The input stream
* @return The footer storing uncompressed size and CRC32
*/
footer read_footer(std::istream& input_stream);

/**
* @brief Writes a header for data of the given compressed size to the given stream.
*
* @param output_stream The output stream
* @param compressed_size The size of the compressed data
* @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
*/
void write_header(std::ostream& output_stream,
uint16_t compressed_size,
host_span<char const> pre_size_subfields,
host_span<char const> post_size_subfields);

/**
* @brief Writes a footer for the given uncompressed data to the given stream.
*
* @param output_stream The output stream
* @param data The data for which uncompressed size and CRC32 will be computed and written
*/
void write_footer(std::ostream& output_stream, host_span<char const> data);

/**
* @brief Writes the given data to the given stream as an uncompressed deflate block with BZGIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param extra_garbage_before Any GZIP extra subfields (need to be valid) to be placed before the
* BGZIP block size subfield
* @param extra_garbage_after Any subfields to be placed after the BGZIP block size subfield
*/
void write_uncompressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> extra_garbage_before = {},
host_span<char const> extra_garbage_after = {});

/**
* @brief Writes the given data to the given stream as a compressed deflate block with BZGIP
* header and footer.
*
* @param output_stream The output stream
* @param data The uncompressed data
* @param extra_garbage_before Any GZIP extra subfields (need to be valid) to be placed before the
upsj marked this conversation as resolved.
Show resolved Hide resolved
* BGZIP block size subfield
* @param extra_garbage_after Any subfields to be placed after the BGZIP block size subfield
*/
void write_compressed_block(std::ostream& output_stream,
host_span<char const> data,
host_span<char const> extra_garbage_before = {},
host_span<char const> extra_garbage_after = {});

} // namespace cudf::io::text::detail::bgzip
72 changes: 5 additions & 67 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

Expand All @@ -36,7 +37,6 @@
#include <limits>

namespace cudf::io::text {

namespace {

/**
Expand Down Expand Up @@ -64,68 +64,6 @@ struct bgzip_nvcomp_transform_functor {

class bgzip_data_chunk_reader : public data_chunk_reader {
private:
template <typename IntType>
static IntType read_int(char* data)
{
IntType result{};
// we assume little-endian
std::memcpy(&result, &data[0], sizeof(result));
return result;
}

struct bgzip_header {
int block_size;
int extra_length;
[[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
};

bgzip_header read_header()
{
std::array<char, 12> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
std::array<uint8_t, 4> const expected_header{{31, 139, 8, 4}};
CUDF_EXPECTS(
std::equal(
expected_header.begin(), expected_header.end(), reinterpret_cast<uint8_t*>(buffer.data())),
"malformed BGZIP header");
// we ignore the remaining bytes of the fixed header, since they don't matter to us
auto const extra_length = read_int<uint16_t>(&buffer[10]);
uint16_t extra_offset{};
// read all the extra subfields
while (extra_offset < extra_length) {
auto const remaining_size = extra_length - extra_offset;
CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length");
// a subfield consists of 2 identifier bytes and a uint16 length
// 66/67 identifies a BGZIP block size field, we skip all other fields
_data_stream->read(buffer.data(), 4);
extra_offset += 4;
auto const subfield_size = read_int<uint16_t>(&buffer[2]);
if (buffer[0] == 66 && buffer[1] == 67) {
// the block size subfield contains a single uint16 value, which is block_size - 1
CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield");
_data_stream->read(buffer.data(), sizeof(uint16_t));
_data_stream->seekg(remaining_size - 6, std::ios_base::cur);
auto const block_size_minus_one = read_int<uint16_t>(&buffer[0]);
return {block_size_minus_one + 1, extra_length};
} else {
_data_stream->seekg(subfield_size, std::ios_base::cur);
extra_offset += subfield_size;
}
}
CUDF_FAIL("missing BGZIP size extra subfield");
}

struct bgzip_footer {
uint32_t decompressed_size;
};

bgzip_footer read_footer()
{
std::array<char, 8> buffer{};
_data_stream->read(buffer.data(), sizeof(buffer));
return {read_int<uint32_t>(&buffer[4])};
}

template <typename T>
using pinned_host_vector =
thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
Expand Down Expand Up @@ -258,13 +196,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
return available_decompressed_size - read_pos;
}

void read_block(bgzip_header header, std::istream& stream)
void read_block(detail::bgzip::header header, std::istream& stream)
{
h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size());
stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size());
}

void add_block_offsets(bgzip_header header, bgzip_footer footer)
void add_block_offsets(detail::bgzip::header header, detail::bgzip::footer footer)
{
max_decompressed_size =
std::max<std::size_t>(footer.decompressed_size, max_decompressed_size);
Expand Down Expand Up @@ -294,9 +232,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
// peek is necessary if we are already at the end, but didn't try to read another byte
_data_stream->peek();
if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; }
auto header = read_header();
auto header = detail::bgzip::read_header(*_data_stream);
_curr_blocks.read_block(header, *_data_stream);
auto footer = read_footer();
auto footer = detail::bgzip::read_footer(*_data_stream);
_curr_blocks.add_block_offsets(header, footer);
// for the last GZIP block, we restrict ourselves to the bytes up to _local_end
// but only for the reader, not for decompression!
Expand Down
Loading