Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert compression and io to string axis type in IO benchmarks #14347

Merged
merged 13 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,51 @@ void try_drop_l3_cache()
[](auto& cmd) { return exec_cmd(cmd).empty(); }),
"Failed to execute the drop cache command");
}

/**
 * @brief Map the textual name of an I/O type to its `cudf::io::io_type` enumerator.
 *
 * The comparison is exact and case-sensitive. Recognised names are "FILEPATH", "HOST_BUFFER",
 * "DEVICE_BUFFER", "VOID" and "USER_IMPLEMENTED"; any other input is reported via `CUDF_FAIL`.
 *
 * @param io_string Name of the io_type
 *
 * @return The matching io_type enumerator
 */
cudf::io::io_type retrieve_io_type_enum(std::string const& io_string)
{
  if (io_string == "FILEPATH") { return cudf::io::io_type::FILEPATH; }
  if (io_string == "HOST_BUFFER") { return cudf::io::io_type::HOST_BUFFER; }
  if (io_string == "DEVICE_BUFFER") { return cudf::io::io_type::DEVICE_BUFFER; }
  if (io_string == "VOID") { return cudf::io::io_type::VOID; }
  if (io_string == "USER_IMPLEMENTED") { return cudf::io::io_type::USER_IMPLEMENTED; }

  // Every recognised name returned above, so reaching this point means the input is unknown.
  CUDF_FAIL("Unsupported io_type.");
}

/**
 * @brief Map the textual name of a compression scheme to its `cudf::io::compression_type`
 * enumerator.
 *
 * The comparison is exact and case-sensitive. Recognised names are "NONE", "AUTO", "SNAPPY",
 * "GZIP", "BZIP2", "BROTLI", "ZIP", "XZ", "ZLIB", "LZ4", "LZO" and "ZSTD"; any other input is
 * reported via `CUDF_FAIL`.
 *
 * @param compression_string Name of the compression_type
 *
 * @return The matching compression_type enumerator
 */
cudf::io::compression_type retrieve_compression_type_enum(std::string const& compression_string)
{
  if (compression_string == "NONE") { return cudf::io::compression_type::NONE; }
  if (compression_string == "AUTO") { return cudf::io::compression_type::AUTO; }
  if (compression_string == "SNAPPY") { return cudf::io::compression_type::SNAPPY; }
  if (compression_string == "GZIP") { return cudf::io::compression_type::GZIP; }
  if (compression_string == "BZIP2") { return cudf::io::compression_type::BZIP2; }
  if (compression_string == "BROTLI") { return cudf::io::compression_type::BROTLI; }
  if (compression_string == "ZIP") { return cudf::io::compression_type::ZIP; }
  if (compression_string == "XZ") { return cudf::io::compression_type::XZ; }
  if (compression_string == "ZLIB") { return cudf::io::compression_type::ZLIB; }
  if (compression_string == "LZ4") { return cudf::io::compression_type::LZ4; }
  if (compression_string == "LZO") { return cudf::io::compression_type::LZO; }
  if (compression_string == "ZSTD") { return cudf::io::compression_type::ZSTD; }

  // Every recognised name returned above, so reaching this point means the input is unknown.
  CUDF_FAIL("Unsupported compression_type.");
}
24 changes: 24 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,27 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
* @throw cudf::logic_error if the environment variable is set and the command fails
*/
void try_drop_l3_cache();

/**
 * @brief Convert a string to the corresponding io_type enum value.
 *
 * The comparison is exact and case-sensitive. Recognised names are "FILEPATH", "HOST_BUFFER",
 * "DEVICE_BUFFER", "VOID" and "USER_IMPLEMENTED".
 *
 * @param io_string The input string representing the io_type
 *
 * @throw cudf::logic_error if `io_string` is not one of the recognised names
 *
 * @return The io_type enum value
 */
cudf::io::io_type retrieve_io_type_enum(std::string const& io_string);

/**
 * @brief Convert a string to the corresponding compression_type enum value.
 *
 * The comparison is exact and case-sensitive. Recognised names are "NONE", "AUTO", "SNAPPY",
 * "GZIP", "BZIP2", "BROTLI", "ZIP", "XZ", "ZLIB", "LZ4", "LZO" and "ZSTD".
 *
 * @param compression_string The input string representing the compression_type
 *
 * @throw cudf::logic_error if `compression_string` is not one of the recognised names
 *
 * @return The compression_type enum value
 */
cudf::io::compression_type retrieve_compression_type_enum(std::string const& compression_string);
92 changes: 38 additions & 54 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,34 +56,30 @@ void parquet_read_common(cudf::io::parquet_writer_options const& write_opts,
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_data(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
cudf::io::io_type const source_type = retrieve_io_type_enum(state.get_string("io_type"));
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);

parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
void BM_parquet_read_io_compression(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
Expand All @@ -94,10 +90,11 @@ void BM_parquet_read_io_compression(
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = Compression;
auto const source_type = IOType;
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::io::io_type const source_type = retrieve_io_type_enum(state.get_string("io_type"));
cudf::io::compression_type const compression =
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
retrieve_compression_type_enum(state.get_string("compression_type"));

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
Expand All @@ -113,17 +110,15 @@ void BM_parquet_read_io_compression(
parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType>
void BM_parquet_read_io_small_mixed(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>>)
void BM_parquet_read_io_small_mixed(nvbench::state& state)
{
auto const d_type =
std::pair<cudf::type_id, cudf::type_id>{cudf::type_id::STRING, cudf::type_id::INT32};

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const num_strings = state.get_int64("num_string_cols");
auto const source_type = IOType;
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const num_strings = state.get_int64("num_string_cols");
cudf::io::io_type const source_type = retrieve_io_type_enum(state.get_string("io_type"));
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved

// want 80 pages total, across 4 columns, so 20 pages per column
cudf::size_type constexpr n_col = 4;
Expand All @@ -145,24 +140,23 @@ void BM_parquet_read_io_small_mixed(nvbench::state& state,
parquet_read_common(write_opts, source_sink, state);
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_chunks(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const byte_limit = state.get_int64("byte_limit");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const byte_limit = state.get_int64("byte_limit");
cudf::io::io_type const source_type = retrieve_io_type_enum(state.get_string("io_type"));
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);
Expand Down Expand Up @@ -202,43 +196,33 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::LIST,
data_type::STRUCT>;

using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
cudf::io::io_type::HOST_BUFFER,
cudf::io::io_type::DEVICE_BUFFER>;

using compression_list =
nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;

NVBENCH_BENCH_TYPES(BM_parquet_read_data,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_data, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_decode")
.set_type_axes_names({"data_type", "io"})
.set_type_axes_names({"data_type"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
NVBENCH_BENCH(BM_parquet_read_io_compression)
.set_name("parquet_read_io_compression")
.set_type_axes_names({"io", "compression"})
.add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"})
.add_string_axis("compression_type", {"SNAPPY", "NONE"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_chunks,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_chunks")
.set_type_axes_names({"data_type", "io"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("byte_limit", {0, 500'000});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed,
NVBENCH_TYPE_AXES(nvbench::enum_type_list<cudf::io::io_type::FILEPATH>))
NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.set_name("parquet_read_io_small_mixed")
.set_type_axes_names({"io"})
.add_string_axis("io_type", {"FILEPATH"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
Expand Down
Loading