[FEA] Add file size counter to cuIO benchmarks #10154

Merged · 4 commits · Jan 29, 2022

17 changes: 8 additions & 9 deletions cpp/benchmarks/io/csv/csv_reader.cpp
@@ -43,9 +43,7 @@ void BM_csv_read_varying_input(benchmark::State& state)

cuio_source_sink_pair source_sink(source_type);
cudf_io::csv_writer_options options =
- cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
- .include_header(true)
- .rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+ cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
cudf_io::write_csv(options);

cudf_io::csv_reader_options const read_options =
@@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_csv_read_varying_options(benchmark::State& state)
@@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

- std::vector<char> csv_data;
+ cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
cudf_io::csv_writer_options options =
- cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
+ cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
- .line_terminator("\r\n")
- .rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+ .line_terminator("\r\n");
cudf_io::write_csv(options);

cudf_io::csv_reader_options read_options =
- cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()})
+ cudf_io::csv_reader_options::builder(source_sink.make_source_info())
.use_cols_indexes(cols_to_read)
.thousands('\'')
.windowslinetermination(true)
.comment('#')
.prefix("BM_");

- size_t const chunk_size = csv_data.size() / num_chunks;
+ size_t const chunk_size = source_sink.size() / num_chunks;
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
@@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
state.SetBytesProcessed(data_processed * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
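
The counters added throughout this PR are plain Google Benchmark user counters: each state.counters[...] entry is reported once per run, next to wall time and bytes/sec. A minimal, self-contained sketch of the same reporting mechanism, outside of cuDF (the byte value is made up for illustration):

#include <benchmark/benchmark.h>

static void BM_counter_demo(benchmark::State& state)
{
  size_t encoded_bytes = 0;
  for (auto _ : state) {
    encoded_bytes = size_t{1} << 20;  // stand-in for the size of an encoded file
    benchmark::DoNotOptimize(encoded_bytes);
  }
  // Reported once alongside the timing results, like "encoded_file_size" above.
  state.counters["encoded_file_size"] = encoded_bytes;
}
BENCHMARK(BM_counter_demo);
BENCHMARK_MAIN();
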
10 changes: 5 additions & 5 deletions cpp/benchmarks/io/csv/csv_writer.cpp
@@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state)
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::csv_writer_options options =
- cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
- .include_header(true)
- .rows_per_chunk(1 << 14); // TODO: remove once default is sensible
+ cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
cudf_io::write_csv(options);
}

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_csv_write_varying_options(benchmark::State& state)
@@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state)
auto const view = tbl->view();

std::string const na_per(na_per_len, '#');
- std::vector<char> csv_data;
+ cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::csv_writer_options options =
- cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view)
+ cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view)
.include_header(true)
.na_rep(na_per)
.rows_per_chunk(rows_per_chunk);
@@ -85,6 +84,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
15 changes: 14 additions & 1 deletion cpp/benchmarks/io/cuio_common.cpp
@@ -16,6 +16,7 @@

#include <benchmarks/io/cuio_common.hpp>

+ #include <fstream>
#include <numeric>
#include <string>

@@ -53,13 +54,25 @@ cudf_io::source_info cuio_source_sink_pair::make_source_info()
cudf_io::sink_info cuio_source_sink_pair::make_sink_info()
{
switch (type) {
- case io_type::VOID: return cudf_io::sink_info();
+ case io_type::VOID: return cudf_io::sink_info(&void_sink);
case io_type::FILEPATH: return cudf_io::sink_info(file_name);
case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer);
default: CUDF_FAIL("invalid output type");
}
}

+ size_t cuio_source_sink_pair::size()
+ {
+   switch (type) {
+     case io_type::VOID: return void_sink.bytes_written();
+     case io_type::FILEPATH:
+       return static_cast<size_t>(
+         std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg());
+     case io_type::HOST_BUFFER: return buffer.size();
+     default: CUDF_FAIL("invalid output type");
+   }
+ }

std::vector<cudf::type_id> dtypes_for_column_selection(std::vector<cudf::type_id> const& data_types,
column_selection col_sel)
{
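
For the FILEPATH case, size() opens the file with the read cursor already at the end (std::ifstream::ate) and reads the cursor position back with tellg(). A standalone sketch of that idiom, using only the standard library (the path below is hypothetical, and error handling is omitted):

#include <fstream>
#include <iostream>
#include <string>

size_t file_size_bytes(std::string const& path)
{
  // ate seeks to the end on open; tellg() then reports the byte offset of the
  // end of the file, i.e. its size. tellg() returns -1 if the open failed.
  std::ifstream in(path, std::ifstream::ate | std::ifstream::binary);
  return static_cast<size_t>(in.tellg());
}

int main() { std::cout << file_size_bytes("example.orc") << " bytes\n"; }
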
12 changes: 12 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
@@ -39,6 +39,15 @@ std::string random_file_in_dir(std::string const& dir_path);
* @brief Class to create a coupled `source_info` and `sink_info` of given type.
*/
class cuio_source_sink_pair {
+   class bytes_written_only_sink : public cudf::io::data_sink {
+     size_t _bytes_written = 0;
+
+    public:
+     void host_write(void const* data, size_t size) override { _bytes_written += size; }
+     void flush() override {}
+     size_t bytes_written() override { return _bytes_written; }
+   };

public:
cuio_source_sink_pair(io_type type);
~cuio_source_sink_pair()
@@ -66,12 +75,15 @@ class cuio_source_sink_pair {
*/
cudf::io::sink_info make_sink_info();

+ [[nodiscard]] size_t size();

private:
static temp_directory const tmpdir;

io_type const type;
std::vector<char> buffer;
std::string const file_name;
+ bytes_written_only_sink void_sink;
};

/**
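
bytes_written_only_sink is what makes the VOID case measurable: it discards every buffer passed to host_write but keeps a running byte count, so writer benchmarks can report an encoded size without paying any I/O cost. A hedged sketch of how such a user-provided data_sink plugs into a writer, based on the cudf::io interfaces shown in this diff (the table_view argument and the .build() call are assumptions for illustration, not code from this PR):

#include <cudf/io/data_sink.hpp>
#include <cudf/io/parquet.hpp>

class counting_sink : public cudf::io::data_sink {
  size_t _bytes = 0;

 public:
  void host_write(void const* /*data*/, size_t size) override { _bytes += size; }  // drop payload
  void flush() override {}
  size_t bytes_written() override { return _bytes; }
};

size_t encoded_size(cudf::table_view const& view)
{
  counting_sink sink;
  auto const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&sink}, view).build();
  cudf::io::write_parquet(opts);
  return sink.bytes_written();  // encoded file size, with no bytes stored anywhere
}
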
16 changes: 9 additions & 7 deletions cpp/benchmarks/io/orc/orc_reader.cpp
@@ -66,13 +66,13 @@ void BM_orc_read_varying_input(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

- std::vector<std::string> get_col_names(std::vector<char> const& orc_data)
+ std::vector<std::string> get_col_names(cudf_io::source_info const& source)
{
cudf_io::orc_reader_options const read_options =
- cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()})
- .num_rows(1);
+ cudf_io::orc_reader_options::builder(source).num_rows(1);
return cudf_io::read_orc(read_options).metadata.column_names;
}

@@ -99,14 +99,15 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

- std::vector<char> orc_data;
+ cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
cudf_io::orc_writer_options options =
- cudf_io::orc_writer_options::builder(cudf_io::sink_info{&orc_data}, view);
+ cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view);
cudf_io::write_orc(options);

- auto const cols_to_read = select_column_names(get_col_names(orc_data), col_sel);
+ auto const cols_to_read =
+   select_column_names(get_col_names(source_sink.make_source_info()), col_sel);
cudf_io::orc_reader_options read_options =
- cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()})
+ cudf_io::orc_reader_options::builder(source_sink.make_source_info())
.columns(cols_to_read)
.use_index(use_index)
.use_np_dtypes(use_np_dtypes)
@@ -148,6 +149,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
state.SetBytesProcessed(data_processed * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
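
get_col_names() above probes the file cheaply: requesting num_rows(1) makes the reader parse the file metadata while decoding almost nothing, and the column names come back on the result. The same pattern as a standalone sketch (the file path is hypothetical):

#include <cudf/io/orc.hpp>

#include <string>
#include <vector>

std::vector<std::string> column_names(std::string const& path)
{
  // Reading a single row is enough to get the file-level metadata.
  auto const opts =
    cudf::io::orc_reader_options::builder(cudf::io::source_info{path}).num_rows(1).build();
  return cudf::io::read_orc(opts).metadata.column_names;
}
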
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/orc/orc_writer.cpp
@@ -62,6 +62,7 @@ void BM_orc_write_varying_inout(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_orc_write_varying_options(benchmark::State& state)
@@ -98,6 +99,7 @@ void BM_orc_write_varying_options(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
18 changes: 9 additions & 9 deletions cpp/benchmarks/io/parquet/parquet_reader.cpp
@@ -66,14 +66,13 @@ void BM_parq_read_varying_input(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

- std::vector<std::string> get_col_names(std::vector<char> const& parquet_data)
+ std::vector<std::string> get_col_names(cudf::io::source_info const& source)
{
cudf_io::parquet_reader_options const read_options =
- cudf_io::parquet_reader_options::builder(
-   cudf_io::source_info{parquet_data.data(), parquet_data.size()})
-   .num_rows(1);
+ cudf_io::parquet_reader_options::builder(source).num_rows(1);
return cudf_io::read_parquet(read_options).metadata.column_names;
}

@@ -100,15 +99,15 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

- std::vector<char> parquet_data;
+ cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
cudf_io::parquet_writer_options options =
- cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&parquet_data}, view);
+ cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view);
cudf_io::write_parquet(options);

- auto const cols_to_read = select_column_names(get_col_names(parquet_data), col_sel);
+ auto const cols_to_read =
+   select_column_names(get_col_names(source_sink.make_source_info()), col_sel);
cudf_io::parquet_reader_options read_options =
- cudf_io::parquet_reader_options::builder(
-   cudf_io::source_info{parquet_data.data(), parquet_data.size()})
+ cudf_io::parquet_reader_options::builder(source_sink.make_source_info())
.columns(cols_to_read)
.convert_strings_to_categories(str_to_categories)
.use_pandas_metadata(use_pandas_metadata)
@@ -150,6 +149,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
state.SetBytesProcessed(data_processed * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_writer.cpp
@@ -61,6 +61,7 @@ void BM_parq_write_varying_inout(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void BM_parq_write_varying_options(benchmark::State& state)
@@ -93,6 +94,7 @@ void BM_parq_write_varying_options(benchmark::State& state)

state.SetBytesProcessed(data_size * state.iterations());
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
9 changes: 7 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp
@@ -25,6 +25,7 @@

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
+ #include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/io/parquet.hpp>
@@ -48,15 +49,17 @@ void PQ_write(benchmark::State& state)
cudf::table_view view = tbl->view();

auto mem_stats_logger = cudf::memory_stats_logger();
+ cuio_source_sink_pair source_sink(io_type::VOID);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::parquet_writer_options opts =
- cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view);
+ cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view);
cudf_io::write_parquet(opts);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0));
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

void PQ_write_chunked(benchmark::State& state)
@@ -71,10 +74,11 @@ void PQ_write_chunked(benchmark::State& state)
}

auto mem_stats_logger = cudf::memory_stats_logger();
+ cuio_source_sink_pair source_sink(io_type::VOID);
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::chunked_parquet_writer_options opts =
- cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info());
+ cudf_io::chunked_parquet_writer_options::builder(source_sink.make_sink_info());
cudf_io::parquet_chunked_writer writer(opts);
std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr<cudf::table> const& tbl) {
writer.write(*tbl);
@@ -84,6 +88,7 @@ void PQ_write_chunked(benchmark::State& state)

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0));
state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
state.counters["encoded_file_size"] = source_sink.size();
}

#define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \
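
With the counter in place across the cuIO benchmarks, every run now reports the encoded output size next to throughput and peak memory, so compression efficiency can be derived straight from the benchmark output. A toy calculation with made-up numbers:

#include <iostream>

int main()
{
  double const in_memory_bytes   = 512.0 * 1024 * 1024;  // logical table size fed to the writer
  double const encoded_file_size = 160.0 * 1024 * 1024;  // "encoded_file_size" counter
  std::cout << "compression ratio: " << in_memory_bytes / encoded_file_size << "x\n";  // 3.2x
}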