diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index f812f272c25..075e9e2c965 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -676,32 +676,37 @@ table_with_metadata read_csv(cudf::io::datasource* source,
   auto const& data        = data_row_offsets.first;
   auto const& row_offsets = data_row_offsets.second;
 
-  // Exclude the end-of-data row from number of rows with actual data
-  auto num_records  = std::max(row_offsets.size(), 1ul) - 1;
-  auto column_flags = std::vector<column_parse::flags>();
-  auto column_names = std::vector<std::string>();
-  auto num_actual_columns = static_cast<int32_t>(reader_opts.get_names().size());
-  auto num_active_columns = num_actual_columns;
-
-  // Check if the user gave us a list of column names
-  if (not reader_opts.get_names().empty()) {
-    column_flags.resize(reader_opts.get_names().size(),
-                        column_parse::enabled | column_parse::inferred);
-    column_names = reader_opts.get_names();
-  } else {
-    column_names = get_column_names(
-      header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
-
-    num_actual_columns = num_active_columns = column_names.size();
-
-    column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);
-
+  auto const unique_use_cols_indexes = std::set(reader_opts.get_use_cols_indexes().cbegin(),
+                                                reader_opts.get_use_cols_indexes().cend());
+
+  auto const detected_column_names =
+    get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
+  auto const opts_have_all_col_names =
+    not reader_opts.get_names().empty() and
+    (
+      // no data to detect (the number of) columns
+      detected_column_names.empty() or
+      // number of user specified names matches what is detected
+      reader_opts.get_names().size() == detected_column_names.size() or
+      // Columns are not selected by indices; read first reader_opts.get_names().size() columns
+      unique_use_cols_indexes.empty());
+  auto column_names = opts_have_all_col_names ? reader_opts.get_names() : detected_column_names;
+
+  auto const num_actual_columns = static_cast<int32_t>(column_names.size());
+  auto num_active_columns       = num_actual_columns;
+  auto column_flags             = std::vector<column_parse::flags>(
+    num_actual_columns, column_parse::enabled | column_parse::inferred);
+
+  // User did not pass column names to override names in the file
+  // Process names from the file to remove empty and duplicated strings
+  if (not opts_have_all_col_names) {
     std::vector<size_t> col_loop_order(column_names.size());
     auto unnamed_it = std::copy_if(
       thrust::make_counting_iterator<size_t>(0),
       thrust::make_counting_iterator(column_names.size()),
       col_loop_order.begin(),
       [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });
 
+    // Rename empty column names to "Unnamed: col_index"
     std::copy_if(thrust::make_counting_iterator<size_t>(0),
                  thrust::make_counting_iterator(column_names.size()),
@@ -756,24 +761,44 @@ table_with_metadata read_csv(cudf::io::datasource* source,
   }
 
   // User can specify which columns should be parsed
-  if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) {
+  auto const unique_use_cols_names = std::unordered_set(reader_opts.get_use_cols_names().cbegin(),
+                                                        reader_opts.get_use_cols_names().cend());
+  auto const is_column_selection_used =
+    not unique_use_cols_names.empty() or not unique_use_cols_indexes.empty();
+
+  // Reset flags and output column count; columns will be reactivated based on the selection options
+  if (is_column_selection_used) {
     std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled);
+    num_active_columns = 0;
+  }
+
+  // Column selection via column indexes
+  if (not unique_use_cols_indexes.empty()) {
+    // Users can pass names for the selected columns only, if selecting column by their indices
+    auto const are_opts_col_names_used =
+      not reader_opts.get_names().empty() and not opts_have_all_col_names;
+    CUDF_EXPECTS(not are_opts_col_names_used or
+                   reader_opts.get_names().size() == unique_use_cols_indexes.size(),
+                 "Specify names of all columns in the file, or names of all selected columns");
 
-    for (const auto index : reader_opts.get_use_cols_indexes()) {
+    for (auto const index : unique_use_cols_indexes) {
       column_flags[index] = column_parse::enabled | column_parse::inferred;
+      if (are_opts_col_names_used) {
+        column_names[index] = reader_opts.get_names()[num_active_columns];
+      }
+      ++num_active_columns;
     }
-    num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(),
-                                            reader_opts.get_use_cols_indexes().end())
-                           .size();
+  }
 
-    for (const auto& name : reader_opts.get_use_cols_names()) {
-      const auto it = std::find(column_names.begin(), column_names.end(), name);
-      if (it != column_names.end()) {
-        auto curr_it = it - column_names.begin();
-        if (column_flags[curr_it] == column_parse::disabled) {
-          column_flags[curr_it] = column_parse::enabled | column_parse::inferred;
-          num_active_columns++;
-        }
+  // Column selection via column names
+  if (not unique_use_cols_names.empty()) {
+    for (auto const& name : unique_use_cols_names) {
+      auto const it = std::find(column_names.cbegin(), column_names.cend(), name);
+      CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected");
+      auto const col_idx = std::distance(column_names.cbegin(), it);
+      if (column_flags[col_idx] == column_parse::disabled) {
+        column_flags[col_idx] = column_parse::enabled | column_parse::inferred;
+        ++num_active_columns;
       }
     }
   }
@@ -810,6 +835,8 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 
   // Return empty table rather than exception if nothing to load
   if (num_active_columns == 0) { return {std::make_unique<table>(), {}}; }
 
+  // Exclude the end-of-data row from number of rows with actual data
+  auto const num_records = std::max(row_offsets.size(), 1ul) - 1;
   auto const column_types = determine_column_types(
     reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream);
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index eeca87446ec..17fddffc93e 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -904,7 +904,7 @@ TEST_F(CsvReaderTest, Strings)
   auto filepath = temp_env->get_temp_dir() + "Strings.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,abc def ghi" << '\n';
     outfile << "20,\"jkl mno pqr\"" << '\n';
     outfile << "30,stu \"\"vwx\"\" yz" << '\n';
@@ -934,7 +934,7 @@ TEST_F(CsvReaderTest, StringsQuotes)
   auto filepath = temp_env->get_temp_dir() + "StringsQuotes.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,`abc,\ndef, ghi`" << '\n';
     outfile << "20,`jkl, ``mno``, pqr`" << '\n';
     outfile << "30,stu `vwx` yz" << '\n';
@@ -963,7 +963,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
   auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv";
   {
     std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << names[0] << ',' << names[1] << ',' << '\n';
+    outfile << names[0] << ',' << names[1] << '\n';
    outfile << "10,\"abcdef ghi\"" << '\n';
     outfile << "20,\"jkl \"\"mno\"\" pqr\"" << '\n';
     outfile << "30,stu \"vwx\" yz" << '\n';
@@ -2244,6 +2244,129 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
   EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
 }
 
+TEST_F(CsvReaderTest, UseColsValidation)
+{
+  const std::string buffer = "1,2,3";
+
+  const cudf::io::csv_reader_options idx_cnt_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b"})
+      .use_cols_indexes({0});
+  EXPECT_THROW(cudf::io::read_csv(idx_cnt_options), cudf::logic_error);
+
+  cudf::io::csv_reader_options unique_idx_cnt_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b"})
+      .use_cols_indexes({0, 0});
+  EXPECT_THROW(cudf::io::read_csv(unique_idx_cnt_options), cudf::logic_error);
+
+  cudf::io::csv_reader_options bad_name_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
+      .names({"a", "b", "c"})
+      .use_cols_names({"nonexistent_name"});
+  EXPECT_THROW(cudf::io::read_csv(bad_name_options), cudf::logic_error);
+}
+
+TEST_F(CsvReaderTest, CropColumns)
+{
+  const std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
+      .names({"a", "b"})
+      .header(-1);
+  const auto result = cudf::io::read_csv(in_opts);
+
+  const auto result_table = result.tbl->view();
+  ASSERT_EQ(result_table.num_columns(), 2);
+  ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32});
+  ASSERT_EQ(result_table.column(1).type(), data_type{type_id::FLOAT32});
+  expect_column_data_equal(std::vector<int32_t>{12, 34, 56}, result_table.column(0));
+  expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(1));
+}
+
+TEST_F(CsvReaderTest, CropColumnsUseColsNames)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
+      .names({"a", "b"})
+      .use_cols_names({"b"})
+      .header(-1);
+  auto result = cudf::io::read_csv(in_opts);
+
+  const auto result_table = result.tbl->view();
+  ASSERT_EQ(result_table.num_columns(), 1);
+  ASSERT_EQ(result_table.column(0).type(), data_type{type_id::FLOAT32});
+  expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(0));
+}
+
+TEST_F(CsvReaderTest, ExtraColumns)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+  {
+    cudf::io::csv_reader_options opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .header(-1);
+    auto result = cudf::io::read_csv(opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 4);
+    ASSERT_EQ(result_table.column(3).type(), data_type{type_id::INT8});
+    ASSERT_EQ(result_table.column(3).null_count(), 3);
+  }
+  {
+    cudf::io::csv_reader_options with_dtypes_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .dtypes({dtype<int32_t>(), dtype<float>(), dtype<int32_t>(), dtype<float>()})
+        .header(-1);
+    auto result = cudf::io::read_csv(with_dtypes_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 4);
+    ASSERT_EQ(result_table.column(3).type(), data_type{type_id::FLOAT32});
+    ASSERT_EQ(result_table.column(3).null_count(), 3);
+  }
+}
+
+TEST_F(CsvReaderTest, ExtraColumnsUseCols)
+{
+  std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
+
+  {
+    cudf::io::csv_reader_options in_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .use_cols_names({"b", "d"})
+        .header(-1);
+    auto result = cudf::io::read_csv(in_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 2);
+    ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT8});
+    ASSERT_EQ(result_table.column(1).null_count(), 3);
+  }
+  {
+    cudf::io::csv_reader_options with_dtypes_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+        .names({"a", "b", "c", "d"})
+        .use_cols_names({"b", "d"})
+        .dtypes({dtype<int32_t>(), dtype<float>(), dtype<int32_t>(), dtype<cudf::string_view>()})
+        .header(-1);
+    auto result = cudf::io::read_csv(with_dtypes_opts);
+
+    const auto result_table = result.tbl->view();
+    ASSERT_EQ(result_table.num_columns(), 2);
+    ASSERT_EQ(result_table.column(1).type(), data_type{type_id::STRING});
+    ASSERT_EQ(result_table.column(1).null_count(), 3);
+  }
+}
+
 TEST_F(CsvReaderTest, EmptyColumns)
 {
   // First column only has empty fields; second column contains only "null" literals
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index e85d404d2c4..7e62f63b0e2 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2205,3 +2205,41 @@ def test_default_float_bitwidth_partial(default_float_bitwidth):
     )
     assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
     assert read["float2"].dtype == np.dtype("f8")
+
+
+@pytest.mark.parametrize(
+    "usecols,names",
+    [
+        # selection using indices; only names of selected columns are specified
+        ([1, 2], ["b", "c"]),
+        # selection using indices; names of all columns are specified
+        ([1, 2], ["a", "b", "c"]),
+        # selection using indices; duplicates
+        ([2, 2], ["a", "b", "c"]),
+        # selection using indices; out of order
+        ([2, 1], ["a", "b", "c"]),
+        # selection using names
+        (["b"], ["a", "b", "c"]),
+        # selection using names; multiple columns
+        (["b", "c"], ["a", "b", "c"]),
+        # selection using names; duplicates
+        (["c", "c"], ["a", "b", "c"]),
+        # selection using names; out of order
+        (["c", "b"], ["a", "b", "c"]),
+    ],
+)
+def test_column_selection_plus_column_names(usecols, names):
+
+    lines = [
+        "num,datetime,text",
+        "123,2018-11-13T12:00:00,abc",
+        "456,2018-11-14T12:35:01,def",
+        "789,2018-11-15T18:02:59,ghi",
+    ]
+
+    buffer = "\n".join(lines) + "\n"
+
+    assert_eq(
+        pd.read_csv(StringIO(buffer), usecols=usecols, names=names),
+        cudf.read_csv(StringIO(buffer), usecols=usecols, names=names),
+    )
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 2c4b73666a5..96d4ea891b1 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1029,16 +1029,22 @@
     the column names: if no names are passed, header=0;
     if column names are passed explicitly, header=None.
 names : list of str, default None
-    List of column names to be used.
+    List of column names to be used. Needs to include names of all columns in
+    the file, or names of all columns selected using `usecols` (only when
+    `usecols` holds integer indices). When `usecols` is not used to select
+    column indices, `names` can contain more names than there are columns in
+    the file. In this case the extra columns will only contain null rows.
 index_col : int, string or False, default None
     Column to use as the row labels of the DataFrame. Passing `index_col=False`
     explicitly disables index column inference and discards the last column.
 usecols : list of int or str, default None
     Returns subset of the columns given in the list. All elements must be
     either integer indices (column number) or strings that correspond to
-    column names
+    column names. When an integer index is passed for each name in the `names`
+    parameter, the names are interpreted as names in the output table, not as
+    names in the input file.
 prefix : str, default None
-    Prefix to add to column numbers when parsing without a header row
+    Prefix to add to column numbers when parsing without a header row.
 mangle_dupe_cols : boolean, default True
     Duplicate columns will be specified as 'X','X.1',...'X.N'.
 dtype : type, str, list of types, or dict of column -> type, default None
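
A minimal usage sketch of the `usecols`/`names` interaction documented above, mirroring the new test_column_selection_plus_column_names test; the CSV contents and the output column names "when" and "text" below are illustrative assumptions, not part of the change.

    from io import StringIO

    import cudf

    # Two data rows, no header row; columns are an integer, a timestamp, and a string.
    csv_data = "123,2018-11-13T12:00:00,abc\n456,2018-11-14T12:35:01,def\n"

    # `usecols` holds integer indices, so `names` only has to cover the selected
    # columns; the names label the output table, not the columns in the input file.
    df = cudf.read_csv(StringIO(csv_data), usecols=[1, 2], names=["when", "text"])
    print(list(df.columns))  # ['when', 'text']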