diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index b5e4f6d8f2b..6e187afd6ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,15 @@ std::vector get_col_names(cudf::io::source_info const& source) { cudf::io::parquet_reader_options const read_options = cudf::io::parquet_reader_options::builder(source); - return cudf::io::read_parquet(read_options).metadata.column_names; + auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info; + + std::vector names; + names.reserve(schema.size()); + std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { + CUDF_EXPECTS(c.children.empty(), "nested types are not supported"); + return c.name; + }); + return names; } template column_names; //!< Names of columns contained in the table std::vector schema_info; //!< Detailed name information for the entire output hierarchy std::map user_data; //!< Format-dependent metadata of the first input diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 614ae2817a4..d9da2f083d1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -570,11 +570,13 @@ table_with_metadata read_avro(std::unique_ptr&& source, } } - // Return column names (must match order of returned columns) - metadata_out.column_names.resize(selected_columns.size()); - for (size_t i = 0; i < selected_columns.size(); i++) { - metadata_out.column_names[i] = selected_columns[i].second; - } + // Return column names + metadata_out.schema_info.reserve(selected_columns.size()); + std::transform(selected_columns.cbegin(), 
+ selected_columns.cend(), + std::back_inserter(metadata_out.schema_info), + [](auto const& c) { return column_name_info{c.second}; }); + // Return user metadata metadata_out.user_data = meta.user_data; metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}}; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 02712edba3b..2e38ea7f4ab 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -845,7 +845,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { - metadata.column_names.emplace_back(out_buffers[i].name); + metadata.schema_info.emplace_back(out_buffers[i].name); if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && parse_opts.doublequote) { // PANDAS' default behavior of enabling doublequote for two consecutive @@ -869,7 +869,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Handle empty metadata for (int col = 0; col < num_actual_columns; ++col) { if (column_flags[col] & column_parse::enabled) { - metadata.column_names.emplace_back(column_names[col]); + metadata.schema_info.emplace_back(column_names[col]); } } } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d57abbdd4b8..be911eca193 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -931,8 +931,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, // Zero row entries if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { - return table_with_metadata{std::make_unique(std::vector>{}), - {{}, std::vector{}}}; + return table_with_metadata{std::make_unique
<table>(std::vector<std::unique_ptr<column>>{})}; } // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) @@ -1013,8 +1012,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, column_index++; } - return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), - {{}, out_column_names}}; + return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), {out_column_names}}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c0d9088abba..fb58b48d68d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1772,8 +1772,7 @@ table_with_metadata host_parse_nested_json(device_span d_input, // Zero row entries if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) { - return table_with_metadata{std::make_unique
<table>(std::vector<std::unique_ptr<column>>{}), - {{}, std::vector<column_name_info>{}}}; + return table_with_metadata{std::make_unique
<table>(std::vector<std::unique_ptr<column>>{})}; } // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) @@ -1848,8 +1847,7 @@ table_with_metadata host_parse_nested_json(device_span d_input, column_index++; } - return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), - {{}, out_column_names}}; + return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), {out_column_names}}; } } // namespace detail diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index bb35fb3576e..0ff1b1fa340 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -575,7 +575,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), {{}, column_infos}}; + return table_with_metadata{std::make_unique
<table>(std::move(out_columns)), {column_infos}}; } /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 9cd14981d77..96eb20e1e66 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1283,13 +1283,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, create_columns(std::move(out_buffers), out_columns, schema_info, stream); } - // Return column names (must match order of returned columns) - out_metadata.column_names.reserve(schema_info.size()); - std::transform(schema_info.cbegin(), - schema_info.cend(), - std::back_inserter(out_metadata.column_names), - [](auto info) { return info.name; }); - out_metadata.schema_info = std::move(schema_info); std::transform(_metadata.per_file_metadata.cbegin(), diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 045e5d55ccd..d5dac10b8f6 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -311,11 +311,11 @@ table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, } if (!_output_metadata) { - // Return column names (must match order of returned columns) - out_metadata.column_names.resize(_output_buffers.size()); + // Return column names + out_metadata.schema_info.resize(_output_buffers.size()); for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const& schema = _metadata->get_schema(_output_column_schemas[i]); - out_metadata.column_names[i] = schema.name; + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.schema_info[i].name = schema.name; } // Return user metadata diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 05062130066..a0daab767c0 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1814,7 +1814,9 @@ TEST_F(CsvReaderTest, StringsWithWriter) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), 
result_table.column(0)); check_string_column(input_table.column(1), result_table.column(1)); - ASSERT_EQ(names, result.metadata.column_names); + ASSERT_EQ(result.metadata.schema_info.size(), names.size()); + for (auto i = 0ul; i < names.size(); ++i) + EXPECT_EQ(names[i], result.metadata.schema_info[i].name); } TEST_F(CsvReaderTest, StringsWithWriterSimple) @@ -1839,7 +1841,9 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0)); check_string_column(input_table.column(1), result_table.column(1)); - ASSERT_EQ(names, result.metadata.column_names); + ASSERT_EQ(result.metadata.schema_info.size(), names.size()); + for (auto i = 0ul; i < names.size(); ++i) + EXPECT_EQ(names[i], result.metadata.schema_info[i].name); } TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) @@ -1860,7 +1864,9 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) auto result = cudf::io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); - ASSERT_EQ(names, result.metadata.column_names); + ASSERT_EQ(result.metadata.schema_info.size(), names.size()); + for (auto i = 0ul; i < names.size(); ++i) + EXPECT_EQ(names[i], result.metadata.schema_info[i].name); } TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) @@ -1888,7 +1894,9 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) auto result = cudf::io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); - ASSERT_EQ(names, result.metadata.column_names); + ASSERT_EQ(result.metadata.schema_info.size(), names.size()); + for (auto i = 0ul; i < names.size(); ++i) + EXPECT_EQ(names[i], result.metadata.schema_info[i].name); } TEST_F(CsvReaderTest, EmptyFileWithWriter) @@ -1994,7 +2002,9 @@ TEST_F(CsvReaderTest, DurationsWithWriter) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result_table); - ASSERT_EQ(names, 
result.metadata.column_names); + ASSERT_EQ(result.metadata.schema_info.size(), names.size()); + for (auto i = 0ul; i < names.size(); ++i) + EXPECT_EQ(names[i], result.metadata.schema_info[i].name); } TEST_F(CsvReaderTest, ParseInRangeIntegers) @@ -2269,8 +2279,8 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch) // verify that the tables are identical, or as identical as expected. const auto new_table_view = new_table_and_metadata.tbl->view(); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, new_table_view); - EXPECT_EQ(new_table_and_metadata.metadata.column_names[0], "0"); - EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1"); + EXPECT_EQ(new_table_and_metadata.metadata.schema_info[0].name, "0"); + EXPECT_EQ(new_table_and_metadata.metadata.schema_info[1].name, "1"); } TEST_F(CsvReaderTest, UseColsValidation) diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 0c8886ca356..0e24b5b7459 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libcpp.string cimport string from libcpp.utility cimport move @@ -50,6 +50,6 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): with nogil: c_result = move(libcudf_read_avro(options)) - names = [name.decode() for name in c_result.metadata.column_names] + names = [info.name.decode() for info in c_result.metadata.schema_info] return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 1f66ed0ee83..eb6683aed31 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -425,7 +425,7 @@ def read_csv( with nogil: c_result = move(cpp_read_csv(read_csv_options_c)) - meta_names = [name.decode() for name in c_result.metadata.column_names] + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index a51d481c6cd..281b2cabc52 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import cudf from cudf.core.buffer import acquire_spill_lock @@ -120,7 +120,7 @@ cpdef read_orc(object filepaths_or_buffers, with nogil: c_result = move(libcudf_read_orc(c_orc_reader_options)) - names = [name.decode() for name in c_result.metadata.column_names] + names = [info.name.decode() for info in c_result.metadata.schema_info] actual_index_names, col_names, is_range_index, reset_index_name, \ range_idx = _get_index_from_metadata(c_result.metadata.user_data, names, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 319c0f62158..e5520ae1987 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. # cython: boundscheck = False @@ -176,17 +176,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, args.set_columns(cpp_columns) # Read Parquet - cdef cudf_io_types.table_with_metadata c_out_table + cdef cudf_io_types.table_with_metadata c_result with nogil: - c_out_table = move(parquet_reader(args)) + c_result = move(parquet_reader(args)) - column_names = [x.decode() for x in c_out_table.metadata.column_names] + names = [info.name.decode() for info in c_result.metadata.schema_info] # Access the Parquet per_file_user_data to find the index index_col = None cdef vector[unordered_map[string, string]] per_file_user_data = \ - c_out_table.metadata.per_file_user_data + c_result.metadata.per_file_user_data index_col_names = None is_range_index = True @@ -207,11 +207,11 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, index_col_names[idx_col] = c['name'] df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_out_table.tbl), - column_names=column_names + move(c_result.tbl), + column_names=names )) - update_struct_field_names(df, c_out_table.metadata.schema_info) + update_struct_field_names(df, c_result.metadata.schema_info) if meta is not None: # Book keep each column metadata as the order @@ -222,7 +222,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, } # update the decimal precision of each column - for col in column_names: + for col in names: if is_decimal_dtype(df._data[col].dtype): df._data[col].dtype.precision = ( meta_data_per_column[col]["metadata"]["precision"] @@ -286,7 +286,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, ) df._index = idx - elif set(index_col).issubset(column_names): + elif set(index_col).issubset(names): index_data = df[index_col] actual_index_names = list(index_col_names.values()) if len(index_data._data) == 1: