From 72e6f9b08d3c52ca96ed64d963305ab9005ebff6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 27 Dec 2023 13:40:43 -0800 Subject: [PATCH] Basic validation in reader benchmarks (#14647) Check the output table shape in the CSV, JSON, ORC and Parquet reader benchmarks. Other changes: Fixed some chunking logic in the CSV reader benchmark. Shortened the lifetime of the original table to reduce peak memory use (adopted the pattern from the JSON reader benchmark). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14647 --- cpp/benchmarks/io/csv/csv_reader_input.cpp | 19 ++- cpp/benchmarks/io/csv/csv_reader_options.cpp | 43 ++--- cpp/benchmarks/io/json/json_reader_input.cpp | 91 +++++------ cpp/benchmarks/io/orc/orc_reader_input.cpp | 69 ++++---- cpp/benchmarks/io/orc/orc_reader_options.cpp | 12 +- .../io/parquet/parquet_reader_input.cpp | 153 ++++++++++-------- .../io/parquet/parquet_reader_options.cpp | 12 +- 7 files changed, 221 insertions(+), 178 deletions(-) diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 6216a9ecec2..2ad3bc36f59 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -47,14 +47,17 @@ void csv_read_common(DataType const& data_types, auto const mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); // Drop L3 cache for accurate measurement - - timer.start(); - cudf::io::read_csv(read_options); - timer.stop(); - }); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); // Drop L3 cache for accurate measurement + + timer.start(); + auto const result = cudf::io::read_csv(read_options); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == view.num_columns(), "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == view.num_rows(), "Unexpected number of rows"); + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp index 93ef5bed774..84c3106cbdf 100644 --- a/cpp/benchmarks/io/csv/csv_reader_options.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp @@ -19,8 +19,9 @@ #include #include +#include +#include #include -#include #include @@ -39,8 +40,9 @@ void BM_csv_read_varying_options( static_cast(data_type::DURATION), static_cast(data_type::STRING)}), ColSelection); - auto const cols_to_read = select_column_indexes(data_types.size(), ColSelection); - auto const num_chunks = state.get_int64("num_chunks"); + auto const cols_to_read = select_column_indexes(data_types.size(), ColSelection); + cudf::size_type const expected_num_cols = cols_to_read.size(); + size_t const num_chunks = state.get_int64("num_chunks"); auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); @@ -60,43 +62,48 @@ void BM_csv_read_varying_options( .comment('#') .prefix("BM_"); - size_t const chunk_size = source_sink.size() / num_chunks; - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; - auto const mem_stats_logger = cudf::memory_stats_logger(); + size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks); + auto const chunk_row_cnt = + cudf::util::div_rounding_up_safe(view.num_rows(), static_cast(num_chunks)); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); // Drop L3 cache for accurate measurement - + cudf::size_type num_rows_read = 0; timer.start(); - for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - // only read the header in the first chunk - read_options.set_header(chunk == 0 ? 0 : -1); - - auto const is_last_chunk = chunk == (num_chunks - 1); + for (auto chunk = 0u; chunk < num_chunks; ++chunk) { switch (RowSelection) { case row_selection::ALL: break; case row_selection::BYTE_RANGE: + // with byte_range, we can't read the header in any chunk but the first + read_options.set_header(chunk == 0 ? 0 : -1); read_options.set_byte_range_offset(chunk * chunk_size); read_options.set_byte_range_size(chunk_size); - if (is_last_chunk) read_options.set_byte_range_size(0); break; case row_selection::NROWS: read_options.set_skiprows(chunk * chunk_row_cnt); read_options.set_nrows(chunk_row_cnt); - if (is_last_chunk) read_options.set_nrows(-1); break; - case row_selection::SKIPFOOTER: + case row_selection::SKIPFOOTER: { read_options.set_skiprows(chunk * chunk_row_cnt); - read_options.set_skipfooter(view.num_rows() - (chunk + 1) * chunk_row_cnt); - if (is_last_chunk) read_options.set_skipfooter(0); + cudf::size_type const next_chunk_start = (chunk + 1) * chunk_row_cnt; + auto const skip_footer = + view.num_rows() > next_chunk_start ? view.num_rows() - next_chunk_start : 0; + read_options.set_skipfooter(skip_footer); break; + } default: CUDF_FAIL("Unsupported row selection method"); } - cudf::io::read_csv(read_options); + auto const result = cudf::io::read_csv(read_options); + + num_rows_read += result.tbl->num_rows(); + CUDF_EXPECTS(result.tbl->num_columns() == expected_num_cols, + "Unexpected number of columns"); } timer.stop(); + CUDF_EXPECTS(num_rows_read == view.num_rows(), "Unexpected number of rows"); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index 31bb5dafa88..aa73dacdbc5 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -29,21 +29,26 @@ constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state) +void json_read_common(cuio_source_sink_pair& source_sink, + cudf::size_type num_rows_to_read, + nvbench::state& state) { cudf::io::json_reader_options read_opts = cudf::io::json_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); - timer.start(); - cudf::io::read_json(read_opts); - timer.stop(); - }); + timer.start(); + auto const result = cudf::io::read_json(read_opts); + timer.stop(); + + CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -52,55 +57,45 @@ void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state) state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +cudf::size_type json_write_bm_data(cudf::io::sink_info sink, + std::vector const& dtypes) +{ + auto const tbl = create_random_table( + cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); + + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(sink, view).na_rep("null").rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + return view.num_rows(); +} + template void BM_json_read_io(nvbench::state& state, nvbench::type_list>) { - auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), - static_cast(data_type::FLOAT), - static_cast(data_type::DECIMAL), - static_cast(data_type::TIMESTAMP), - static_cast(data_type::DURATION), - static_cast(data_type::STRING), - static_cast(data_type::LIST), - static_cast(data_type::STRUCT)}); - - auto const source_type = IO; - cuio_source_sink_pair source_sink(source_type); - - { - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); - - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) - .na_rep("null") - .rows_per_chunk(100'000); - cudf::io::write_json(write_opts); - } - - json_read_common(source_sink, state); + cuio_source_sink_pair source_sink(IO); + auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::TIMESTAMP), + static_cast(data_type::DURATION), + static_cast(data_type::STRING), + static_cast(data_type::LIST), + static_cast(data_type::STRUCT)}); + auto const num_rows = json_write_bm_data(source_sink.make_sink_info(), d_type); + + json_read_common(source_sink, num_rows, state); } template void BM_json_read_data_type( nvbench::state& state, nvbench::type_list, nvbench::enum_type>) { - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const source_type = IO; - cuio_source_sink_pair source_sink(source_type); - { - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); - - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) - .na_rep("null") - .rows_per_chunk(100'000); - cudf::io::write_json(write_opts); - } - json_read_common(source_sink, state); + cuio_source_sink_pair source_sink(IO); + auto const d_type = get_type_or_group(static_cast(DataType)); + auto const num_rows = json_write_bm_data(source_sink.make_sink_info(), d_type); + + json_read_common(source_sink, num_rows, state); } using d_type_list = nvbench::enum_type_listnum_columns() == num_cols, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -63,18 +64,22 @@ void BM_orc_read_data(nvbench::state& state, auto const d_type = get_type_or_group(static_cast(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); + cuio_source_sink_pair source_sink(IOType); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); - cuio_source_sink_pair source_sink(IOType); - cudf::io::orc_writer_options opts = - cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view); + cudf::io::orc_writer_options opts = + cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view); + cudf::io::write_orc(opts); + return view.num_rows(); + }(); - orc_read_common(opts, source_sink, state); + orc_read_common(num_rows_written, source_sink, state); } template @@ -92,19 +97,23 @@ void BM_orc_read_io_compression( cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - cuio_source_sink_pair source_sink(IOType); - cudf::io::orc_writer_options opts = - cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) - .compression(Compression); - orc_read_common(opts, source_sink, state); + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::orc_writer_options opts = + cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) + .compression(Compression); + cudf::io::write_orc(opts); + return view.num_rows(); + }(); + + orc_read_common(num_rows_written, source_sink, state); } using d_type_list = nvbench::enum_type_listnum_rows(); + auto const result = cudf::io::read_orc(read_options); + + num_rows_read += result.tbl->num_rows(); + CUDF_EXPECTS(result.tbl->num_columns() == expected_num_cols, + "Unexpected number of columns"); } - CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table"); timer.stop(); + CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 6db147cbfef..019e0f30fe9 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -29,25 +29,27 @@ constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -void parquet_read_common(cudf::io::parquet_writer_options const& write_opts, +void parquet_read_common(cudf::size_type num_rows_to_read, + cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::write_parquet(write_opts); - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::read_parquet(read_opts); + timer.stop(); - timer.start(); - cudf::io::read_parquet(read_opts); - timer.stop(); - }); + CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); + CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); @@ -64,19 +66,23 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list(state.get_int64("run_length")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto const compression = cudf::io::compression_type::SNAPPY; - - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - parquet_read_common(write_opts, source_sink, state); + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(compression); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, num_cols, source_sink, state); } void BM_parquet_read_io_compression(nvbench::state& state) @@ -94,19 +100,23 @@ void BM_parquet_read_io_compression(nvbench::state& state) auto const run_length = static_cast(state.get_int64("run_length")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); - - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - parquet_read_common(write_opts, source_sink, state); + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(compression); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, num_cols, source_sink, state); } void BM_parquet_read_io_small_mixed(nvbench::state& state) @@ -118,25 +128,28 @@ void BM_parquet_read_io_small_mixed(nvbench::state& state) auto const run_length = static_cast(state.get_int64("run_length")); auto const num_strings = static_cast(state.get_int64("num_string_cols")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + cuio_source_sink_pair source_sink(source_type); // want 80 pages total, across 4 columns, so 20 pages per column cudf::size_type constexpr n_col = 4; cudf::size_type constexpr page_size_rows = 10'000; cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); - auto const tbl = - create_random_table(mix_dtypes(d_type, n_col, num_strings), - row_count{num_rows}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cuio_source_sink_pair source_sink(source_type); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .max_page_size_rows(10'000) - .compression(cudf::io::compression_type::NONE); - - parquet_read_common(write_opts, source_sink, state); + { + auto const tbl = create_random_table( + mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + } + + parquet_read_common(num_rows, n_col, source_sink, state); } template @@ -148,36 +161,44 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list(state.get_int64("byte_limit")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto const compression = cudf::io::compression_type::SNAPPY; + cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(compression); - cudf::io::write_parquet(write_opts); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); - do { - [[maybe_unused]] auto const chunk = reader.read_chunk(); - } while (reader.has_next()); - timer.stop(); - }); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); + cudf::size_type num_rows_read = 0; + do { + auto const result = reader.read_chunk(); + num_rows_read += result.tbl->num_rows(); + CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns"); + } while (reader.has_next()); + timer.stop(); + + CUDF_EXPECTS(num_rows_read == num_rows_written, "Benchmark did not read the entire table"); + }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 9f221de7da2..62925e8d315 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -83,6 +83,7 @@ void BM_parquet_read_options(nvbench::state& state, auto const cols_to_read = select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection); + cudf::size_type const expected_num_cols = cols_to_read.size(); cudf::io::parquet_reader_options read_options = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) @@ -98,9 +99,8 @@ void BM_parquet_read_options(nvbench::state& state, state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); - + cudf::size_type num_rows_read = 0; timer.start(); - cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { switch (RowSelection) { case row_selection::ALL: break; @@ -114,11 +114,15 @@ void BM_parquet_read_options(nvbench::state& state, default: CUDF_FAIL("Unsupported row selection method"); } - rows_read += cudf::io::read_parquet(read_options).tbl->num_rows(); + auto const result = cudf::io::read_parquet(read_options); + + num_rows_read += result.tbl->num_rows(); + CUDF_EXPECTS(result.tbl->num_columns() == expected_num_cols, + "Unexpected number of columns"); } - CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table"); timer.stop(); + CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table"); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");