Remove column names (#12578)
Closes #6411

Removed `column_names` in favor of `schema_info`.
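For flat (non-nested) tables the migration is mechanical: take the top-level names from `schema_info` instead. A minimal sketch of a post-change call site (the `read_names` helper and its Parquet source are ours, for illustration only):

```cpp
#include <string>
#include <vector>

#include <cudf/io/parquet.hpp>

// Sketch: collect top-level column names after this change.
// Previously this was just `return result.metadata.column_names;`.
std::vector<std::string> read_names(cudf::io::source_info const& source)
{
  auto const options = cudf::io::parquet_reader_options::builder(source).build();
  auto const result  = cudf::io::read_parquet(options);

  std::vector<std::string> names;
  names.reserve(result.metadata.schema_info.size());
  for (auto const& info : result.metadata.schema_info) {
    names.push_back(info.name);  // top-level entries; info.children holds nested field names
  }
  return names;
}
```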

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Elias Stehle (https://github.com/elstehle)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #12578
vuule authored Jan 24, 2023
1 parent 7f6ae05 commit 9c9862f
Showing 14 changed files with 61 additions and 66 deletions.
12 changes: 10 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,15 @@ std::vector<std::string> get_col_names(cudf::io::source_info const& source)
 {
   cudf::io::parquet_reader_options const read_options =
     cudf::io::parquet_reader_options::builder(source);
-  return cudf::io::read_parquet(read_options).metadata.column_names;
+  auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info;
+
+  std::vector<std::string> names;
+  names.reserve(schema.size());
+  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
+    CUDF_EXPECTS(c.children.empty(), "nested types are not supported");
+    return c.name;
+  });
+  return names;
 }

 template <column_selection ColSelection,
16 changes: 1 addition & 15 deletions cpp/include/cudf/io/types.hpp
@@ -128,23 +128,9 @@ struct column_name_info {
 };

 /**
- * @brief Table metadata for io readers/writers (primarily column names)
- *
- * For nested types (structs, maps, unions), the ordering of names in the column_names vector
- * corresponds to a pre-order traversal of the column tree.
- * In the example below (2 top-level columns: struct column "col1" and string column "col2"),
- * column_names = {"col1", "s3", "f5", "f6", "f4", "col2"}.
- *
- *     col1     col2
- *    /    \
- *   /      \
- *  s3       f4
- *  /  \
- * /    \
- * f5    f6
+ * @brief Table metadata returned by IO readers.
  */
 struct table_metadata {
-  std::vector<std::string> column_names;  //!< Names of columns contained in the table
   std::vector<column_name_info>
     schema_info;  //!< Detailed name information for the entire output hierarchy
   std::map<std::string, std::string> user_data;  //!< Format-dependent metadata of the first input
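The removed comment documented that `column_names` flattened nested schemas in pre-order. Callers that relied on that ordering can reconstruct it from `schema_info`, which keeps the hierarchy explicit; a hedged sketch (the `flatten_names` helper is ours, not part of the cuDF API):

```cpp
#include <string>
#include <vector>

#include <cudf/io/types.hpp>

// Hypothetical helper: pre-order walk over schema_info, visiting each
// column before its children, matching the old column_names ordering.
void flatten_names(std::vector<cudf::io::column_name_info> const& cols,
                   std::vector<std::string>& out)
{
  for (auto const& col : cols) {
    out.push_back(col.name);
    flatten_names(col.children, out);
  }
}
```

For the struct/string example from the removed comment, this yields {"col1", "s3", "f5", "f6", "f4", "col2"}.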
12 changes: 7 additions & 5 deletions cpp/src/io/avro/reader_impl.cu
@@ -570,11 +570,13 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
     }
   }

-  // Return column names (must match order of returned columns)
-  metadata_out.column_names.resize(selected_columns.size());
-  for (size_t i = 0; i < selected_columns.size(); i++) {
-    metadata_out.column_names[i] = selected_columns[i].second;
-  }
+  // Return column names
+  metadata_out.schema_info.reserve(selected_columns.size());
+  std::transform(selected_columns.cbegin(),
+                 selected_columns.cend(),
+                 std::back_inserter(metadata_out.schema_info),
+                 [](auto const& c) { return column_name_info{c.second}; });

   // Return user metadata
   metadata_out.user_data = meta.user_data;
   metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}};
4 changes: 2 additions & 2 deletions cpp/src/io/csv/reader_impl.cu
@@ -845,7 +845,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
                              stream,
                              mr);
   for (size_t i = 0; i < column_types.size(); ++i) {
-    metadata.column_names.emplace_back(out_buffers[i].name);
+    metadata.schema_info.emplace_back(out_buffers[i].name);
     if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
         parse_opts.doublequote) {
       // PANDAS' default behavior of enabling doublequote for two consecutive
@@ -869,7 +869,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
     // Handle empty metadata
     for (int col = 0; col < num_actual_columns; ++col) {
       if (column_flags[col] & column_parse::enabled) {
-        metadata.column_names.emplace_back(column_names[col]);
+        metadata.schema_info.emplace_back(column_names[col]);
       }
     }
   }
6 changes: 2 additions & 4 deletions cpp/src/io/json/json_column.cu
@@ -931,8 +931,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,

   // Zero row entries
   if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{}),
-                               {{}, std::vector<column_name_info>{}}};
+    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
   }

   // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
@@ -1013,8 +1012,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     column_index++;
   }

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
-                             {{}, out_column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
 }

 }  // namespace detail
6 changes: 2 additions & 4 deletions cpp/src/io/json/nested_json_gpu.cu
@@ -1772,8 +1772,7 @@ table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,

   // Zero row entries
   if (data_root.type == json_col_t::ListColumn && data_root.child_columns.size() == 0) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{}),
-                               {{}, std::vector<column_name_info>{}}};
+    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
   }

   // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
@@ -1848,8 +1847,7 @@ table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
     column_index++;
   }

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
-                             {{}, out_column_names}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
 }

 }  // namespace detail
2 changes: 1 addition & 1 deletion cpp/src/io/json/reader_impl.cu
@@ -575,7 +575,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,

   CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");

-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {{}, column_infos}};
+  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_infos}};
 }

 /**
7 changes: 0 additions & 7 deletions cpp/src/io/orc/reader_impl.cu
@@ -1283,13 +1283,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
     create_columns(std::move(out_buffers), out_columns, schema_info, stream);
   }

-  // Return column names (must match order of returned columns)
-  out_metadata.column_names.reserve(schema_info.size());
-  std::transform(schema_info.cbegin(),
-                 schema_info.cend(),
-                 std::back_inserter(out_metadata.column_names),
-                 [](auto info) { return info.name; });
-
   out_metadata.schema_info = std::move(schema_info);

   std::transform(_metadata.per_file_metadata.cbegin(),
8 changes: 4 additions & 4 deletions cpp/src/io/parquet/reader_impl.cpp
@@ -311,11 +311,11 @@ table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata,
   }

   if (!_output_metadata) {
-    // Return column names (must match order of returned columns)
-    out_metadata.column_names.resize(_output_buffers.size());
+    // Return column names
+    out_metadata.schema_info.resize(_output_buffers.size());
     for (size_t i = 0; i < _output_column_schemas.size(); i++) {
-      auto const& schema           = _metadata->get_schema(_output_column_schemas[i]);
-      out_metadata.column_names[i] = schema.name;
+      auto const& schema               = _metadata->get_schema(_output_column_schemas[i]);
+      out_metadata.schema_info[i].name = schema.name;
     }

     // Return user metadata
24 changes: 17 additions & 7 deletions cpp/tests/io/csv_test.cpp
@@ -1814,7 +1814,9 @@ TEST_F(CsvReaderTest, StringsWithWriter)
   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0));
   check_string_column(input_table.column(1), result_table.column(1));
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, StringsWithWriterSimple)
@@ -1839,7 +1841,9 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple)
   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0));
   check_string_column(input_table.column(1), result_table.column(1));
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, StringsEmbeddedDelimiter)
@@ -1860,7 +1864,9 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter)
   auto result = cudf::io::read_csv(in_opts);

   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
@@ -1888,7 +1894,9 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
   auto result = cudf::io::read_csv(in_opts);

   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, EmptyFileWithWriter)
@@ -1994,7 +2002,9 @@ TEST_F(CsvReaderTest, DurationsWithWriter)

   const auto result_table = result.tbl->view();
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result_table);
-  ASSERT_EQ(names, result.metadata.column_names);
+  ASSERT_EQ(result.metadata.schema_info.size(), names.size());
+  for (auto i = 0ul; i < names.size(); ++i)
+    EXPECT_EQ(names[i], result.metadata.schema_info[i].name);
 }

TEST_F(CsvReaderTest, ParseInRangeIntegers)
@@ -2269,8 +2279,8 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
   // verify that the tables are identical, or as identical as expected.
   const auto new_table_view = new_table_and_metadata.tbl->view();
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, new_table_view);
-  EXPECT_EQ(new_table_and_metadata.metadata.column_names[0], "0");
-  EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
+  EXPECT_EQ(new_table_and_metadata.metadata.schema_info[0].name, "0");
+  EXPECT_EQ(new_table_and_metadata.metadata.schema_info[1].name, "1");
 }

TEST_F(CsvReaderTest, UseColsValidation)
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/avro.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -50,6 +50,6 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
     with nogil:
         c_result = move(libcudf_read_avro(options))

-    names = [name.decode() for name in c_result.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]

     return data_from_unique_ptr(move(c_result.tbl), column_names=names)
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/csv.pyx
@@ -425,7 +425,7 @@ def read_csv(
     with nogil:
         c_result = move(cpp_read_csv(read_csv_options_c))

-    meta_names = [name.decode() for name in c_result.metadata.column_names]
+    meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
     df = cudf.DataFrame._from_data(*data_from_unique_ptr(
         move(c_result.tbl),
         column_names=meta_names
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 import cudf
 from cudf.core.buffer import acquire_spill_lock
@@ -120,7 +120,7 @@ cpdef read_orc(object filepaths_or_buffers,
     with nogil:
         c_result = move(libcudf_read_orc(c_orc_reader_options))

-    names = [name.decode() for name in c_result.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]
     actual_index_names, col_names, is_range_index, reset_index_name, \
         range_idx = _get_index_from_metadata(c_result.metadata.user_data,
                                              names,
20 changes: 10 additions & 10 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

 # cython: boundscheck = False
@@ -176,17 +176,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         args.set_columns(cpp_columns)

     # Read Parquet
-    cdef cudf_io_types.table_with_metadata c_out_table
+    cdef cudf_io_types.table_with_metadata c_result

     with nogil:
-        c_out_table = move(parquet_reader(args))
+        c_result = move(parquet_reader(args))

-    column_names = [x.decode() for x in c_out_table.metadata.column_names]
+    names = [info.name.decode() for info in c_result.metadata.schema_info]

     # Access the Parquet per_file_user_data to find the index
     index_col = None
     cdef vector[unordered_map[string, string]] per_file_user_data = \
-        c_out_table.metadata.per_file_user_data
+        c_result.metadata.per_file_user_data

     index_col_names = None
     is_range_index = True
@@ -207,11 +207,11 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 index_col_names[idx_col] = c['name']

     df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_out_table.tbl),
-        column_names=column_names
+        move(c_result.tbl),
+        column_names=names
     ))

-    update_struct_field_names(df, c_out_table.metadata.schema_info)
+    update_struct_field_names(df, c_result.metadata.schema_info)

     if meta is not None:
         # Book keep each column metadata as the order
@@ -222,7 +222,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         }

         # update the decimal precision of each column
-        for col in column_names:
+        for col in names:
             if is_decimal_dtype(df._data[col].dtype):
                 df._data[col].dtype.precision = (
                     meta_data_per_column[col]["metadata"]["precision"]
@@ -286,7 +286,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 )

         df._index = idx
-    elif set(index_col).issubset(column_names):
+    elif set(index_col).issubset(names):
         index_data = df[index_col]
         actual_index_names = list(index_col_names.values())
         if len(index_data._data) == 1:
