Remove column names (#12578)
Closes #6411

Removed `column_names` in favor of `schema_info`.
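For flat (non-nested) tables the migration is mechanical: take the top-level names from `schema_info` instead. A minimal sketch of a post-change call site (the `read_names` helper and its Parquet source are ours, for illustration only):

```cpp
#include <string>
#include <vector>

#include <cudf/io/parquet.hpp>

// Sketch: collect top-level column names after this change.
// Previously this was just `return result.metadata.column_names;`.
std::vector<std::string> read_names(cudf::io::source_info const& source)
{
  auto const options = cudf::io::parquet_reader_options::builder(source).build();
  auto const result  = cudf::io::read_parquet(options);

  std::vector<std::string> names;
  names.reserve(result.metadata.schema_info.size());
  for (auto const& info : result.metadata.schema_info) {
    names.push_back(info.name);  // top-level entries; info.children holds nested field names
  }
  return names;
}
```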

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Elias Stehle (https://github.com/elstehle)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #12578
vuule authored Jan 24, 2023
1 parent 7f6ae05 commit 9c9862f
Showing 14 changed files with 61 additions and 66 deletions.
12 changes: 10 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,15 @@ std::vector<std::string> get_col_names(cudf::io::source_info const& source)
 {
   cudf::io::parquet_reader_options const read_options =
     cudf::io::parquet_reader_options::builder(source);
-  return cudf::io::read_parquet(read_options).metadata.column_names;
+  auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info;
+
+  std::vector<std::string> names;
+  names.reserve(schema.size());
+  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
+    CUDF_EXPECTS(c.children.empty(), "nested types are not supported");
+    return c.name;
+  });
+  return names;
 }

 template <column_selection ColSelection,
16 changes: 1 addition & 15 deletions cpp/include/cudf/io/types.hpp
@@ -128,23 +128,9 @@ struct column_name_info {
 };

 /**
- * @brief Table metadata for io readers/writers (primarily column names)
- *
- * For nested types (structs, maps, unions), the ordering of names in the column_names vector
- * corresponds to a pre-order traversal of the column tree.
- * In the example below (2 top-level columns: struct column "col1" and string column "col2"),
- * column_names = {"col1", "s3", "f5", "f6", "f4", "col2"}.
- *
- *     col1     col2
- *    /    \
- *   /      \
- *  s3       f4
- *  /  \
- * /    \
- * f5    f6
+ * @brief Table metadata returned by IO readers.
  */
 struct table_metadata {
-  std::vector<std::string> column_names;  //!< Names of columns contained in the table
   std::vector<column_name_info>
     schema_info;  //!< Detailed name information for the entire output hierarchy
   std::map<std::string, std::string> user_data;  //!< Format-dependent metadata of the first input
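The removed comment documented that `column_names` flattened nested schemas in pre-order. Callers that relied on that ordering can reconstruct it from `schema_info`, which keeps the hierarchy explicit; a hedged sketch (the `flatten_names` helper is ours, not part of the cuDF API):

```cpp
#include <string>
#include <vector>

#include <cudf/io/types.hpp>

// Hypothetical helper: pre-order walk over schema_info, visiting each
// column before its children, matching the old column_names ordering.
void flatten_names(std::vector<cudf::io::column_name_info> const& cols,
                   std::vector<std::string>& out)
{
  for (auto const& col : cols) {
    out.push_back(col.name);
    flatten_names(col.children, out);
  }
}
```

For the struct/string example from the removed comment, this yields {"col1", "s3", "f5", "f6", "f4", "col2"}.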
12 changes: 7 additions & 5 deletions cpp/src/io/avro/reader_impl.cu
@@ -570,11 +570,13 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
     }
   }

-  // Return column names (must match order of returned columns)
-  metadata_out.column_names.resize(selected_columns.size());
-  for (size_t i = 0; i < selected_columns.size(); i++) {
-    metadata_out.column_names[i] = selected_columns[i].second;
-  }
+  // Return column names
+  metadata_out.schema_info.reserve(selected_columns.size());
+  std::transform(selected_columns.cbegin(),
+                 selected_columns.cend(),
+                 std::back_inserter(metadata_out.schema_info),
+                 [](auto const& c) { return column_name_info{c.second}; });

   // Return user metadata
   metadata_out.user_data = meta.user_data;
   metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}};
4 changes: 2 additions & 2 deletions cpp/src/io/csv/reader_impl.cu
@@ -845,7 +845,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
                              stream,
                              mr);
   for (size_t i = 0; i < column_types.size(); ++i) {
-    metadata.column_names.emplace_back(out_buffers[i].name);
+    metadata.schema_info.emplace_back(out_buffers[i].name);
     if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
         parse_opts.doublequote) {
       // PANDAS' default behavior of enabling doublequote for two consecutive
@@ -869,7 +869,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
     // Handle empty metadata
     for (int col = 0; col < num_actual_columns; ++col) {
       if (column_flags[col] & column_parse::enabled) {
-        metadata.column_names.emplace_back(column_names[col]);
+        metadata.schema_info.emplace_back(column_names[col]);
       }
     }
   }
6 changes: 2 additions & 4 deletions cpp/src/io/json/json_column.cu
@@ -931,8 +931,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,

   // Zero row entries
   if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{}),
-                               {{}, std::vector<column_name_info>{}}};
+    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
   }

   // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
@@ -1013,8 +1012,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     column_index++;
   }

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
-                             {{}, out_column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
 }

 }  // namespace detail
6 changes: 2 additions & 4 deletions cpp/src/io/json/nested_json_gpu.cu
@@ -1772,8 +1772,7 @@ table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,

   // Zero row entries
   if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{}),
-                               {{}, std::vector<column_name_info>{}}};
+    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
   }

   // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
@@ -1848,8 +1847,7 @@ table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
     column_index++;
   }

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
-                             {{}, out_column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
 }

 }  // namespace detail
2 changes: 1 addition & 1 deletion cpp/src/io/json/reader_impl.cu
@@ -575,7 +575,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,

   CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {{}, column_infos}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_infos}};
 }

 /**
7 changes: 0 additions & 7 deletions cpp/src/io/orc/reader_impl.cu
@@ -1283,13 +1283,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
     create_columns(std::move(out_buffers), out_columns, schema_info, stream);
   }

-  // Return column names (must match order of returned columns)
-  out_metadata.column_names.reserve(schema_info.size());
-  std::transform(schema_info.cbegin(),
-                 schema_info.cend(),
-                 std::back_inserter(out_metadata.column_names),
-                 [](auto info) { return info.name; });
-
   out_metadata.schema_info = std::move(schema_info);

   std::transform(_metadata.per_file_metadata.cbegin(),
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/reader_impl.cpp
@@ -311,11 +311,11 @@ table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata,
   }

   if (!_output_metadata) {
-    // Return column names (must match order of returned columns)
-    out_metadata.column_names.resize(_output_buffers.size());
+    // Return column names
+    out_metadata.schema_info.resize(_output_buffers.size());
     for (size_t i = 0; i < _output_column_schemas.size(); i++) {
-      auto const& schema           = _metadata->get_schema(_output_column_schemas[i]);
-      out_metadata.column_names[i] = schema.name;
+      auto const& schema               = _metadata->get_schema(_output_column_schemas[i]);
+      out_metadata.schema_info[i].name = schema.name;
     }

     // Return user metadata
24 changes: 17 additions & 7 deletions cpp/tests/io/csv_test.cpp
@@ -1814,7 +1814,9 @@ TEST_F(CsvReaderTest, StringsWithWriter)
   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0));
   check_string_column(input_table.column(1), result_table.column(1));
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, StringsWithWriterSimple)
@@ -1839,7 +1841,9 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple)
   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0));
   check_string_column(input_table.column(1), result_table.column(1));
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, StringsEmbeddedDelimiter)
@@ -1860,7 +1864,9 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter)
   auto result = cudf::io::read_csv(in_opts);

   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
@@ -1888,7 +1894,9 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
   auto result = cudf::io::read_csv(in_opts);

   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, EmptyFileWithWriter)
@@ -1994,7 +2002,9 @@ TEST_F(CsvReaderTest, DurationsWithWriter)

   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result_table);
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, ParseInRangeIntegers)
@@ -2269,8 +2279,8 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
   // verify that the tables are identical, or as identical as expected.
   const auto new_table_view = new_table_and_metadata.tbl->view();
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, new_table_view);
-  EXPECT_EQ(new_table_and_metadata.metadata.column_names[0], "0");
-  EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
+  EXPECT_EQ(new_table_and_metadata.metadata.schema_info[0].name, "0");
+  EXPECT_EQ(new_table_and_metadata.metadata.schema_info[1].name, "1");
 }

TEST_F(CsvReaderTest, UseColsValidation)
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/avro.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -50,6 +50,6 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
     with nogil:
         c_result = move(libcudf_read_avro(options))

-    names = [name.decode() for name in c_result.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]

     return data_from_unique_ptr(move(c_result.tbl), column_names=names)
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/csv.pyx
@@ -425,7 +425,7 @@ def read_csv(
     with nogil:
         c_result = move(cpp_read_csv(read_csv_options_c))

-    meta_names = [name.decode() for name in c_result.metadata.column_names]
+    meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
     df = cudf.DataFrame._from_data(*data_from_unique_ptr(
         move(c_result.tbl),
         column_names=meta_names
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 import cudf
 from cudf.core.buffer import acquire_spill_lock
@@ -120,7 +120,7 @@ cpdef read_orc(object filepaths_or_buffers,
     with nogil:
         c_result = move(libcudf_read_orc(c_orc_reader_options))

-    names = [name.decode() for name in c_result.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]
     actual_index_names, col_names, is_range_index, reset_index_name, \
         range_idx = _get_index_from_metadata(c_result.metadata.user_data,
                                              names,
20 changes: 10 additions & 10 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

 # cython: boundscheck = False
@@ -176,17 +176,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         args.set_columns(cpp_columns)

     # Read Parquet
-    cdef cudf_io_types.table_with_metadata c_out_table
+    cdef cudf_io_types.table_with_metadata c_result

     with nogil:
-        c_out_table = move(parquet_reader(args))
+        c_result = move(parquet_reader(args))

-    column_names = [x.decode() for x in c_out_table.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]

     # Access the Parquet per_file_user_data to find the index
     index_col = None
     cdef vector[unordered_map[string, string]] per_file_user_data = \
-        c_out_table.metadata.per_file_user_data
+        c_result.metadata.per_file_user_data

     index_col_names = None
     is_range_index = True
@@ -207,11 +207,11 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 index_col_names[idx_col] = c['name']

     df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_out_table.tbl),
-        column_names=column_names
+        move(c_result.tbl),
+        column_names=names
     ))

-    update_struct_field_names(df, c_out_table.metadata.schema_info)
+    update_struct_field_names(df, c_result.metadata.schema_info)

     if meta is not None:
         # Book keep each column metadata as the order
@@ -222,7 +222,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         }

         # update the decimal precision of each column
-        for col in column_names:
+        for col in names:
             if is_decimal_dtype(df._data[col].dtype):
                 df._data[col].dtype.precision = (
                     meta_data_per_column[col]["metadata"]["precision"]
@@ -286,7 +286,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 )

         df._index = idx
-    elif set(index_col).issubset(column_names):
+    elif set(index_col).issubset(names):
         index_data = df[index_col]
         actual_index_names = list(index_col_names.values())
         if len(index_data._data) == 1:
