From 46b59006556a4e5bc4ff4258399e813c397eb7fd Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Apr 2023 18:54:39 -0700 Subject: [PATCH] Fix column selection `read_parquet` benchmarks (#13082) Helper function `get_col_names` in the Parquet reader benchmarks throws with nested columns. It should instead just ignore the child columns and return the top-level column names. Also renamed the function to better reflect what it does. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/13082 --- cpp/benchmarks/io/orc/orc_reader_options.cpp | 4 ++-- cpp/benchmarks/io/parquet/parquet_reader_options.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 1e841f744ae..0361ba7c7a6 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -33,7 +33,7 @@ constexpr int64_t data_size = 512 << 20; // Each call reads roughly equal amounts of data constexpr int32_t chunked_read_num_chunks = 8; -std::vector get_col_names(cudf::io::source_info const& source) +std::vector get_top_level_col_names(cudf::io::source_info const& source) { auto const top_lvl_cols = cudf::io::read_orc_metadata(source).schema().root().children(); std::vector col_names; @@ -79,7 +79,7 @@ void BM_orc_read_varying_options(nvbench::state& state, cudf::io::write_orc(options); auto const cols_to_read = - select_column_names(get_col_names(source_sink.make_source_info()), ColSelection); + select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection); cudf::io::orc_reader_options read_options = cudf::io::orc_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp 
b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 3fd46fa08f2..5a6e4a8cb72 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -30,7 +30,7 @@ constexpr std::size_t data_size = 512 << 20; constexpr std::size_t row_group_size = 128 << 20; -std::vector get_col_names(cudf::io::source_info const& source) +std::vector get_top_level_col_names(cudf::io::source_info const& source) { cudf::io::parquet_reader_options const read_options = cudf::io::parquet_reader_options::builder(source); @@ -39,7 +39,6 @@ std::vector get_col_names(cudf::io::source_info const& source) std::vector names; names.reserve(schema.size()); std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { - CUDF_EXPECTS(c.children.empty(), "nested types are not supported"); return c.name; }); return names; @@ -81,7 +80,7 @@ void BM_parquet_read_options(nvbench::state& state, cudf::io::write_parquet(options); auto const cols_to_read = - select_column_names(get_col_names(source_sink.make_source_info()), ColSelection); + select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection); cudf::io::parquet_reader_options read_options = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read)