Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert compression and io to string axis type in IO benchmarks #14347

Merged
merged 13 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,30 @@ void try_drop_l3_cache()
[](auto& cmd) { return exec_cmd(cmd).empty(); }),
"Failed to execute the drop cache command");
}

// Maps the textual name of an io_type (as used on the benchmark string axes) to its enumerator.
// Matching is exact and case-sensitive; any other input ends in CUDF_FAIL.
cudf::io::io_type retrieve_io_type_enum(std::string_view io_string)
{
  // One row per accepted spelling, paired with the enumerator it selects.
  struct io_type_entry {
    std::string_view name;
    cudf::io::io_type value;
  };

  static constexpr io_type_entry known_io_types[] = {
    {"FILEPATH", cudf::io::io_type::FILEPATH},
    {"HOST_BUFFER", cudf::io::io_type::HOST_BUFFER},
    {"DEVICE_BUFFER", cudf::io::io_type::DEVICE_BUFFER},
    {"VOID", cudf::io::io_type::VOID},
    {"USER_IMPLEMENTED", cudf::io::io_type::USER_IMPLEMENTED},
  };

  for (auto const& entry : known_io_types) {
    if (entry.name == io_string) { return entry.value; }
  }

  // No row matched: report the unsupported name.
  CUDF_FAIL("Unsupported io_type.");
}

// Maps the textual name of a compression_type (as used on the benchmark string axes) to its
// enumerator. Matching is exact and case-sensitive; any other input ends in CUDF_FAIL.
cudf::io::compression_type retrieve_compression_type_enum(std::string_view compression_string)
{
  // One row per accepted spelling, paired with the enumerator it selects.
  struct compression_entry {
    std::string_view name;
    cudf::io::compression_type value;
  };

  static constexpr compression_entry known_compressions[] = {
    {"NONE", cudf::io::compression_type::NONE},
    {"AUTO", cudf::io::compression_type::AUTO},
    {"SNAPPY", cudf::io::compression_type::SNAPPY},
    {"GZIP", cudf::io::compression_type::GZIP},
    {"BZIP2", cudf::io::compression_type::BZIP2},
    {"BROTLI", cudf::io::compression_type::BROTLI},
    {"ZIP", cudf::io::compression_type::ZIP},
    {"XZ", cudf::io::compression_type::XZ},
    {"ZLIB", cudf::io::compression_type::ZLIB},
    {"LZ4", cudf::io::compression_type::LZ4},
    {"LZO", cudf::io::compression_type::LZO},
    {"ZSTD", cudf::io::compression_type::ZSTD},
  };

  for (auto const& entry : known_compressions) {
    if (entry.name == compression_string) { return entry.value; }
  }

  // No row matched: report the unsupported name.
  CUDF_FAIL("Unsupported compression_type.");
}
24 changes: 24 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,27 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
* @throw cudf::logic_error if the environment variable is set and the command fails
*/
void try_drop_l3_cache();

/**
* @brief Convert a string to the corresponding io_type enum value.
*
* The string is compared exactly (case-sensitively) against the enumerator names. Accepted values
* are "FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER", "VOID" and "USER_IMPLEMENTED". Benchmarks use
* this to turn the value of a string axis (e.g. "io_type") into the enum expected by the I/O APIs.
*
* @param io_string The input string representing the io_type
*
* @throw cudf::logic_error (raised through CUDF_FAIL) if the string matches none of the accepted
* values
*
* @return The io_type enum value
*/
cudf::io::io_type retrieve_io_type_enum(std::string_view io_string);

/**
* @brief Convert a string to the corresponding compression_type enum value.
*
* The string is compared exactly (case-sensitively) against the enumerator names. Accepted values
* are "NONE", "AUTO", "SNAPPY", "GZIP", "BZIP2", "BROTLI", "ZIP", "XZ", "ZLIB", "LZ4", "LZO" and
* "ZSTD". Benchmarks use this to turn the value of a string axis (e.g. "compression_type") into
* the enum expected by the I/O APIs.
*
* @param compression_string The input string representing the compression_type
*
* @throw cudf::logic_error (raised through CUDF_FAIL) if the string matches none of the accepted
* values
*
* @return The compression_type enum value
*/
cudf::io::compression_type retrieve_compression_type_enum(std::string_view compression_string);
91 changes: 37 additions & 54 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,34 +56,30 @@ void parquet_read_common(cudf::io::parquet_writer_options const& write_opts,
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_data(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);

parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
void BM_parquet_read_io_compression(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
Expand All @@ -94,10 +90,10 @@ void BM_parquet_read_io_compression(
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = Compression;
auto const source_type = IOType;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = retrieve_compression_type_enum(state.get_string("compression_type"));

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
Expand All @@ -113,17 +109,15 @@ void BM_parquet_read_io_compression(
parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType>
void BM_parquet_read_io_small_mixed(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>>)
void BM_parquet_read_io_small_mixed(nvbench::state& state)
{
auto const d_type =
std::pair<cudf::type_id, cudf::type_id>{cudf::type_id::STRING, cudf::type_id::INT32};

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const num_strings = state.get_int64("num_string_cols");
auto const source_type = IOType;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const num_strings = static_cast<cudf::size_type>(state.get_int64("num_string_cols"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));

// want 80 pages total, across 4 columns, so 20 pages per column
cudf::size_type constexpr n_col = 4;
Expand All @@ -145,24 +139,23 @@ void BM_parquet_read_io_small_mixed(nvbench::state& state,
parquet_read_common(write_opts, source_sink, state);
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_chunks(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const byte_limit = state.get_int64("byte_limit");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const byte_limit = static_cast<cudf::size_type>(state.get_int64("byte_limit"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);
Expand Down Expand Up @@ -202,43 +195,33 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::LIST,
data_type::STRUCT>;

using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
cudf::io::io_type::HOST_BUFFER,
cudf::io::io_type::DEVICE_BUFFER>;

using compression_list =
nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;

NVBENCH_BENCH_TYPES(BM_parquet_read_data,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_data, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_decode")
.set_type_axes_names({"data_type", "io"})
.set_type_axes_names({"data_type"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
NVBENCH_BENCH(BM_parquet_read_io_compression)
.set_name("parquet_read_io_compression")
.set_type_axes_names({"io", "compression"})
.add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"})
.add_string_axis("compression_type", {"SNAPPY", "NONE"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_chunks,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_chunks")
.set_type_axes_names({"data_type", "io"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("byte_limit", {0, 500'000});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed,
NVBENCH_TYPE_AXES(nvbench::enum_type_list<cudf::io::io_type::FILEPATH>))
NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.set_name("parquet_read_io_small_mixed")
.set_type_axes_names({"io"})
.add_string_axis("io_type", {"FILEPATH"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
Expand Down