rapidsai · rapids-bot · Nov 17, 2022 · Oct 27, 2022 · Oct 27, 2022 · Oct 27, 2022
@@ -677,25 +677,29 @@ table_with_metadata read_csv(cudf::io::datasource* source,
   auto const& row_offsets = data_row_offsets.second;
 
   // Exclude the end-of-data row from number of rows with actual data
-  auto num_records        = std::max(row_offsets.size(), 1ul) - 1;
-  auto column_flags       = std::vector<column_parse::flags>();
-  auto column_names       = std::vector<std::string>();
-  auto num_actual_columns = static_cast<int32_t>(reader_opts.get_names().size());
-  auto num_active_columns = num_actual_columns;
-
-  // Check if the user gave us a list of column names
-  if (not reader_opts.get_names().empty()) {
-    column_flags.resize(reader_opts.get_names().size(),
-                        column_parse::enabled | column_parse::inferred);
-    column_names = reader_opts.get_names();
-  } else {
-    column_names = get_column_names(
-      header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
-
-    num_actual_columns = num_active_columns = column_names.size();
-
-    column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);
-
+  auto num_records                   = std::max(row_offsets.size(), 1ul) - 1;
+  auto const unique_use_cols_indexes = std::set<int>(reader_opts.get_use_cols_indexes().cbegin(),
+                                                     reader_opts.get_use_cols_indexes().cend());
+
+  auto const detected_column_names =
+    get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
+  auto const opts_have_all_col_names =
+    not reader_opts.get_names().empty() and
+    (
+      // no data to detect (the number of) columns
+      detected_column_names.empty() or
+      // number of user specified names matches what is detected
+      reader_opts.get_names().size() == detected_column_names.size() or
+      // Columns are not selected by indices; read first reader_opts.get_names().size() columns
+      unique_use_cols_indexes.empty());
+  auto column_names = opts_have_all_col_names ? reader_opts.get_names() : detected_column_names;
+
+  auto const num_actual_columns = static_cast<int32_t>(column_names.size());
+  auto num_active_columns       = num_actual_columns;
+  auto column_flags             = std::vector<column_parse::flags>(
+    num_actual_columns, column_parse::enabled | column_parse::inferred);
+
+  if (not opts_have_all_col_names) {
     std::vector<size_t> col_loop_order(column_names.size());
     auto unnamed_it = std::copy_if(
       thrust::make_counting_iterator<size_t>(0),
@@ -756,24 +760,40 @@ table_with_metadata read_csv(cudf::io::datasource* source,
   }
 
   // User can specify which columns should be parsed
-  if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) {
+  auto const unique_use_cols_names = std::unordered_set<std::string>(
+    reader_opts.get_use_cols_names().cbegin(), reader_opts.get_use_cols_names().cend());
+  auto const is_column_selection_used =
+    not unique_use_cols_names.empty() or not unique_use_cols_indexes.empty();
+
+  if (is_column_selection_used) {
     std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled);
+    num_active_columns = 0;
+  }
 
-    for (const auto index : reader_opts.get_use_cols_indexes()) {
+  if (not unique_use_cols_indexes.empty()) {
+    auto const are_opts_col_names_used =
+      not reader_opts.get_names().empty() and not opts_have_all_col_names;
+    CUDF_EXPECTS(not are_opts_col_names_used or
+                   reader_opts.get_names().size() == unique_use_cols_indexes.size(),
+                 "Specify names of all columns in the file, or names of all selected columns");
+
+    for (auto const index : unique_use_cols_indexes) {
       column_flags[index] = column_parse::enabled | column_parse::inferred;
+      if (are_opts_col_names_used) {
+        column_names[index] = reader_opts.get_names()[num_active_columns];
+      }
+      ++num_active_columns;
     }
-    num_active_columns = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
-                                                 reader_opts.get_use_cols_indexes().end())
-                           .size();
+  }
 
-    for (const auto& name : reader_opts.get_use_cols_names()) {
-      const auto it = std::find(column_names.begin(), column_names.end(), name);
-      if (it != column_names.end()) {
-        auto curr_it = it - column_names.begin();
-        if (column_flags[curr_it] == column_parse::disabled) {
-          column_flags[curr_it] = column_parse::enabled | column_parse::inferred;
-          num_active_columns++;
-        }
+  if (not unique_use_cols_names.empty()) {
+    for (auto const& name : unique_use_cols_names) {
+      auto const it = std::find(column_names.cbegin(), column_names.cend(), name);
+      CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected");
+      auto const curr_it = it - column_names.begin();
+      if (column_flags[curr_it] == column_parse::disabled) {
+        column_flags[curr_it] = column_parse::enabled | column_parse::inferred;
+        ++num_active_columns;
       }
     }
   }

@@ -904,7 +904,7 @@ TEST_F(CsvReaderTest, Strings)
   auto filepath = temp_env->get_temp_dir() + "Strings.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,abc def ghi" << '\n';
     outfile << "20,\"jkl mno pqr\"" << '\n';
     outfile << "30,stu \"\"vwx\"\" yz" << '\n';
@@ -934,7 +934,7 @@ TEST_F(CsvReaderTest, StringsQuotes)
   auto filepath = temp_env->get_temp_dir() + "StringsQuotes.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,`abc,\ndef, ghi`" << '\n';
     outfile << "20,`jkl, ``mno``, pqr`" << '\n';
     outfile << "30,stu `vwx` yz" << '\n';
@@ -963,7 +963,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
   auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,\"abcdef ghi\"" << '\n';
     outfile << "20,\"jkl \"\"mno\"\" pqr\"" << '\n';
     outfile << "30,stu \"vwx\" yz" << '\n';
@@ -2244,6 +2244,129 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
   EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
 }
 
+TEST_F(CsvReaderTest, UseColsValidation)
+{
+  const std::string buffer = "1,2,3";
+
+  const cudf::io::csv_reader_options idx_cnt_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b"})
+      .use_cols_indexes({0});
+  EXPECT_THROW(cudf::io::read_csv(idx_cnt_options), cudf::logic_error);
+
+  cudf::io::csv_reader_options unique_idx_cnt_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b"})
+      .use_cols_indexes({0, 0});
+  EXPECT_THROW(cudf::io::read_csv(unique_idx_cnt_options), cudf::logic_error);
+
+  cudf::io::csv_reader_options bad_name_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b", "c"})
+      .use_cols_names({"nonexistent_name"});
+  EXPECT_THROW(cudf::io::read_csv(bad_name_options), cudf::logic_error);
+}
+
+TEST_F(CsvReaderTest, CropColumns)
+{
+  const std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
+      .names({"a", "b"})
+      .header(-1);
+  const auto result = cudf::io::read_csv(in_opts);
+
+  const auto result_table = result.tbl->view();
+  ASSERT_EQ(result_table.num_columns(), 2);
+  ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32});
+  ASSERT_EQ(result_table.column(1).type(), data_type{type_id::FLOAT32});
+  expect_column_data_equal(std::vector<int32_t>{12, 34, 56}, result_table.column(0));
+  expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(1));
+}
+
+TEST_F(CsvReaderTest, CropColumnsUseColsNames)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
+      .names({"a", "b"})
+      .use_cols_names({"b"})
+      .header(-1);
+  auto result = cudf::io::read_csv(in_opts);
+
+  const auto result_table = result.tbl->view();
+  ASSERT_EQ(result_table.num_columns(), 1);
+  ASSERT_EQ(result_table.column(0).type(), data_type{type_id::FLOAT32});
+  expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(0));
+}
+
+TEST_F(CsvReaderTest, ExtraColumns)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+  {
+    cudf::io::csv_reader_options opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .header(-1);
+    auto result = cudf::io::read_csv(opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 4);
+    ASSERT_EQ(result_table.column(3).type(), data_type{type_id::INT8});
+    ASSERT_EQ(result_table.column(3).null_count(), 3);
+  }
+  {
+    cudf::io::csv_reader_options with_dtypes_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .dtypes({dtype<int32_t>(), dtype<int32_t>(), dtype<int32_t>(), dtype<float>()})
+        .header(-1);
+    auto result = cudf::io::read_csv(with_dtypes_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 4);
+    ASSERT_EQ(result_table.column(3).type(), data_type{type_id::FLOAT32});
+    ASSERT_EQ(result_table.column(3).null_count(), 3);
+  }
+}
+
+TEST_F(CsvReaderTest, ExtraColumnsUseCols)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  {
+    cudf::io::csv_reader_options in_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .use_cols_names({"b", "d"})
+        .header(-1);
+    auto result = cudf::io::read_csv(in_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 2);
+    ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT8});
+    ASSERT_EQ(result_table.column(1).null_count(), 3);
+  }
+  {
+    cudf::io::csv_reader_options with_dtypes_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .use_cols_names({"b", "d"})
+        .dtypes({dtype<int32_t>(), dtype<int32_t>(), dtype<int32_t>(), dtype<cudf::string_view>()})
+        .header(-1);
+    auto result = cudf::io::read_csv(with_dtypes_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 2);
+    ASSERT_EQ(result_table.column(1).type(), data_type{type_id::STRING});
+    ASSERT_EQ(result_table.column(1).null_count(), 3);
+  }
+}
+
 TEST_F(CsvReaderTest, EmptyColumns)
 {
   // First column only has empty fields. second column contains only "null" literals

@@ -2205,3 +2205,41 @@ def test_default_float_bitwidth_partial(default_float_bitwidth):
     )
     assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
     assert read["float2"].dtype == np.dtype("f8")
+
+
+@pytest.mark.parametrize(
+    "usecols,names",
+    [
+        # selection using indices; only names of selected columns are specified
+        ([1, 2], ["b", "c"]),
+        # selection using indices; names of all columns are specified
+        ([1, 2], ["a", "b", "c"]),
+        # selection using indices; duplicates
+        ([2, 2], ["a", "b", "c"]),
+        # selection using indices; out of order
+        ([2, 1], ["a", "b", "c"]),
+        # selection using names
+        (["b"], ["a", "b", "c"]),
+        # selection using names; multiple columns
+        (["b", "c"], ["a", "b", "c"]),
+        # selection using names; duplicates
+        (["c", "c"], ["a", "b", "c"]),
+        # selection using names; out of order
+        (["c", "b"], ["a", "b", "c"]),
+    ],
+)
+def test_column_selection_plus_column_names(usecols, names):
+
+    lines = [
+        "num,datetime,text",
+        "123,2018-11-13T12:00:00,abc",
+        "456,2018-11-14T12:35:01,def",
+        "789,2018-11-15T18:02:59,ghi",
+    ]
+
+    buffer = "\n".join(lines) + "\n"
+
+    assert_eq(
+        pd.read_csv(StringIO(buffer), usecols=usecols, names=names),
+        cudf.read_csv(StringIO(buffer), usecols=usecols, names=names),
+    )
@@ -1029,16 +1029,22 @@
     the column names: if no names are passed, header=0;
     if column names are passed explicitly, header=None.
 names : list of str, default None
-    List of column names to be used.
+    List of column names to be used. Needs to include names of all columns in
+    the file, or names of all columns selected using `usecols` (only when
+    `usecols` holds integer indices). When `usecols` is not used to select
+    column indices, `names` can contain more names than there are columns i.n
+    the file. In this case the extra columns will only contain null rows.
 index_col : int, string or False, default None
     Column to use as the row labels of the DataFrame. Passing `index_col=False`
     explicitly disables index column inference and discards the last column.
 usecols : list of int or str, default None
     Returns subset of the columns given in the list. All elements must be
     either integer indices (column number) or strings that correspond to
-    column names
+    column names. When an integer index is passed for each name in the `names`
+    parameter, the names are interpreted as names in the output table, not as
+    names in the input file.
 prefix : str, default None
-    Prefix to add to column numbers when parsing without a header row
+    Prefix to add to column numbers when parsing without a header row.
 mangle_dupe_cols : boolean, default True
     Duplicate columns will be specified as 'X','X.1',...'X.N'.
 dtype : type, str, list of types, or dict of column -> type, default None