From 6de2c4e7c98a40551924c9c8892ce59fc1b771cf Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 16 Nov 2022 21:58:16 -0800 Subject: [PATCH] Fix issues when both `usecols` and `names` options are used in `read_csv` (#12018) closes https://github.com/rapidsai/cudf/issues/8973 CSV reader has a few gaps in the logic for column selection and user-specified column names: 1. Users cannot specify names for only the selected columns; 2. Reader fails in unpredictable ways when only a subset of column names is passed (w/o column selection); This PR fixes the issues above. Users can now specify column names (fewer names than the actual number of columns are allowed) or names of columns selected via their indices (must match the number of indices). If selection via indices is used, the number of column names has to match either the actual number of columns, or the number of selected columns. Also fixed a test error that went unnoticed due to the issues above. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Karthikeyan (https://github.com/karthikeyann) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/12018 --- cpp/src/io/csv/reader_impl.cu | 93 +++++++++++++-------- cpp/tests/io/csv_test.cpp | 129 ++++++++++++++++++++++++++++- python/cudf/cudf/tests/test_csv.py | 38 +++++++++ python/cudf/cudf/utils/ioutils.py | 12 ++- 4 files changed, 233 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index f812f272c25..075e9e2c965 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -676,32 +676,37 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const& data = data_row_offsets.first; auto const& row_offsets = data_row_offsets.second; - // Exclude the end-of-data row from number of rows with actual data 
- auto num_records = std::max(row_offsets.size(), 1ul) - 1; - auto column_flags = std::vector(); - auto column_names = std::vector(); - auto num_actual_columns = static_cast(reader_opts.get_names().size()); - auto num_active_columns = num_actual_columns; - - // Check if the user gave us a list of column names - if (not reader_opts.get_names().empty()) { - column_flags.resize(reader_opts.get_names().size(), - column_parse::enabled | column_parse::inferred); - column_names = reader_opts.get_names(); - } else { - column_names = get_column_names( - header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); - - num_actual_columns = num_active_columns = column_names.size(); - - column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred); - + auto const unique_use_cols_indexes = std::set(reader_opts.get_use_cols_indexes().cbegin(), + reader_opts.get_use_cols_indexes().cend()); + + auto const detected_column_names = + get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + auto const opts_have_all_col_names = + not reader_opts.get_names().empty() and + ( + // no data to detect (the number of) columns + detected_column_names.empty() or + // number of user specified names matches what is detected + reader_opts.get_names().size() == detected_column_names.size() or + // Columns are not selected by indices; read first reader_opts.get_names().size() columns + unique_use_cols_indexes.empty()); + auto column_names = opts_have_all_col_names ? 
reader_opts.get_names() : detected_column_names; + + auto const num_actual_columns = static_cast(column_names.size()); + auto num_active_columns = num_actual_columns; + auto column_flags = std::vector( + num_actual_columns, column_parse::enabled | column_parse::inferred); + + // User did not pass column names to override names in the file + // Process names from the file to remove empty and duplicated strings + if (not opts_have_all_col_names) { std::vector col_loop_order(column_names.size()); auto unnamed_it = std::copy_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), col_loop_order.begin(), [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); }); + // Rename empty column names to "Unnamed: col_index" std::copy_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), @@ -756,24 +761,44 @@ table_with_metadata read_csv(cudf::io::datasource* source, } // User can specify which columns should be parsed - if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + auto const unique_use_cols_names = std::unordered_set(reader_opts.get_use_cols_names().cbegin(), + reader_opts.get_use_cols_names().cend()); + auto const is_column_selection_used = + not unique_use_cols_names.empty() or not unique_use_cols_indexes.empty(); + + // Reset flags and output column count; columns will be reactivated based on the selection options + if (is_column_selection_used) { std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + num_active_columns = 0; + } + + // Column selection via column indexes + if (not unique_use_cols_indexes.empty()) { + // Users can pass names for the selected columns only, if selecting column by their indices + auto const are_opts_col_names_used = + not reader_opts.get_names().empty() and not opts_have_all_col_names; + CUDF_EXPECTS(not are_opts_col_names_used or + reader_opts.get_names().size() == 
unique_use_cols_indexes.size(), + "Specify names of all columns in the file, or names of all selected columns"); - for (const auto index : reader_opts.get_use_cols_indexes()) { + for (auto const index : unique_use_cols_indexes) { column_flags[index] = column_parse::enabled | column_parse::inferred; + if (are_opts_col_names_used) { + column_names[index] = reader_opts.get_names()[num_active_columns]; + } + ++num_active_columns; } - num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), - reader_opts.get_use_cols_indexes().end()) - .size(); + } - for (const auto& name : reader_opts.get_use_cols_names()) { - const auto it = std::find(column_names.begin(), column_names.end(), name); - if (it != column_names.end()) { - auto curr_it = it - column_names.begin(); - if (column_flags[curr_it] == column_parse::disabled) { - column_flags[curr_it] = column_parse::enabled | column_parse::inferred; - num_active_columns++; - } + // Column selection via column names + if (not unique_use_cols_names.empty()) { + for (auto const& name : unique_use_cols_names) { + auto const it = std::find(column_names.cbegin(), column_names.cend(), name); + CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected"); + auto const col_idx = std::distance(column_names.cbegin(), it); + if (column_flags[col_idx] == column_parse::disabled) { + column_flags[col_idx] = column_parse::enabled | column_parse::inferred; + ++num_active_columns; } } } @@ -810,6 +835,8 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Return empty table rather than exception if nothing to load if (num_active_columns == 0) { return {std::make_unique(), {}}; } + // Exclude the end-of-data row from number of rows with actual data + auto const num_records = std::max(row_offsets.size(), 1ul) - 1; auto const column_types = determine_column_types( reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); diff --git a/cpp/tests/io/csv_test.cpp 
b/cpp/tests/io/csv_test.cpp index eeca87446ec..17fddffc93e 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -904,7 +904,7 @@ TEST_F(CsvReaderTest, Strings) auto filepath = temp_env->get_temp_dir() + "Strings.csv"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << names[0] << ',' << names[1] << ',' << '\n'; + outfile << names[0] << ',' << names[1] << '\n'; outfile << "10,abc def ghi" << '\n'; outfile << "20,\"jkl mno pqr\"" << '\n'; outfile << "30,stu \"\"vwx\"\" yz" << '\n'; @@ -934,7 +934,7 @@ TEST_F(CsvReaderTest, StringsQuotes) auto filepath = temp_env->get_temp_dir() + "StringsQuotes.csv"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << names[0] << ',' << names[1] << ',' << '\n'; + outfile << names[0] << ',' << names[1] << '\n'; outfile << "10,`abc,\ndef, ghi`" << '\n'; outfile << "20,`jkl, ``mno``, pqr`" << '\n'; outfile << "30,stu `vwx` yz" << '\n'; @@ -963,7 +963,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << names[0] << ',' << names[1] << ',' << '\n'; + outfile << names[0] << ',' << names[1] << '\n'; outfile << "10,\"abcdef ghi\"" << '\n'; outfile << "20,\"jkl \"\"mno\"\" pqr\"" << '\n'; outfile << "30,stu \"vwx\" yz" << '\n'; @@ -2244,6 +2244,129 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch) EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1"); } +TEST_F(CsvReaderTest, UseColsValidation) +{ + const std::string buffer = "1,2,3"; + + const cudf::io::csv_reader_options idx_cnt_options = + cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .names({"a", "b"}) + .use_cols_indexes({0}); + EXPECT_THROW(cudf::io::read_csv(idx_cnt_options), cudf::logic_error); + + cudf::io::csv_reader_options unique_idx_cnt_options = + cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), 
buffer.size()}) + .names({"a", "b"}) + .use_cols_indexes({0, 0}); + EXPECT_THROW(cudf::io::read_csv(unique_idx_cnt_options), cudf::logic_error); + + cudf::io::csv_reader_options bad_name_options = + cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) + .names({"a", "b", "c"}) + .use_cols_names({"nonexistent_name"}); + EXPECT_THROW(cudf::io::read_csv(bad_name_options), cudf::logic_error); +} + +TEST_F(CsvReaderTest, CropColumns) +{ + const std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"}; + + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .dtypes(std::vector{dtype(), dtype()}) + .names({"a", "b"}) + .header(-1); + const auto result = cudf::io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32}); + ASSERT_EQ(result_table.column(1).type(), data_type{type_id::FLOAT32}); + expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); + expect_column_data_equal(std::vector{9., 8., 7.}, result_table.column(1)); +} + +TEST_F(CsvReaderTest, CropColumnsUseColsNames) +{ + std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"}; + + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .dtypes(std::vector{dtype(), dtype()}) + .names({"a", "b"}) + .use_cols_names({"b"}) + .header(-1); + auto result = cudf::io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 1); + ASSERT_EQ(result_table.column(0).type(), data_type{type_id::FLOAT32}); + expect_column_data_equal(std::vector{9., 8., 7.}, result_table.column(0)); +} + +TEST_F(CsvReaderTest, ExtraColumns) +{ + std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"}; + { + cudf::io::csv_reader_options opts = + 
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"a", "b", "c", "d"}) + .header(-1); + auto result = cudf::io::read_csv(opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 4); + ASSERT_EQ(result_table.column(3).type(), data_type{type_id::INT8}); + ASSERT_EQ(result_table.column(3).null_count(), 3); + } + { + cudf::io::csv_reader_options with_dtypes_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"a", "b", "c", "d"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) + .header(-1); + auto result = cudf::io::read_csv(with_dtypes_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 4); + ASSERT_EQ(result_table.column(3).type(), data_type{type_id::FLOAT32}); + ASSERT_EQ(result_table.column(3).null_count(), 3); + } +} + +TEST_F(CsvReaderTest, ExtraColumnsUseCols) +{ + std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"}; + + { + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"a", "b", "c", "d"}) + .use_cols_names({"b", "d"}) + .header(-1); + auto result = cudf::io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT8}); + ASSERT_EQ(result_table.column(1).null_count(), 3); + } + { + cudf::io::csv_reader_options with_dtypes_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"a", "b", "c", "d"}) + .use_cols_names({"b", "d"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) + .header(-1); + auto result = cudf::io::read_csv(with_dtypes_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(1).type(), 
data_type{type_id::STRING}); + ASSERT_EQ(result_table.column(1).null_count(), 3); + } +} + TEST_F(CsvReaderTest, EmptyColumns) { // First column only has empty fields. second column contains only "null" literals diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index e85d404d2c4..7e62f63b0e2 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2205,3 +2205,41 @@ def test_default_float_bitwidth_partial(default_float_bitwidth): ) assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}") assert read["float2"].dtype == np.dtype("f8") + + +@pytest.mark.parametrize( + "usecols,names", + [ + # selection using indices; only names of selected columns are specified + ([1, 2], ["b", "c"]), + # selection using indices; names of all columns are specified + ([1, 2], ["a", "b", "c"]), + # selection using indices; duplicates + ([2, 2], ["a", "b", "c"]), + # selection using indices; out of order + ([2, 1], ["a", "b", "c"]), + # selection using names + (["b"], ["a", "b", "c"]), + # selection using names; multiple columns + (["b", "c"], ["a", "b", "c"]), + # selection using names; duplicates + (["c", "c"], ["a", "b", "c"]), + # selection using names; out of order + (["c", "b"], ["a", "b", "c"]), + ], +) +def test_column_selection_plus_column_names(usecols, names): + + lines = [ + "num,datetime,text", + "123,2018-11-13T12:00:00,abc", + "456,2018-11-14T12:35:01,def", + "789,2018-11-15T18:02:59,ghi", + ] + + buffer = "\n".join(lines) + "\n" + + assert_eq( + pd.read_csv(StringIO(buffer), usecols=usecols, names=names), + cudf.read_csv(StringIO(buffer), usecols=usecols, names=names), + ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 2c4b73666a5..96d4ea891b1 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1029,16 +1029,22 @@ the column names: if no names are passed, header=0; if column names are passed explicitly, 
header=None. names : list of str, default None - List of column names to be used. + List of column names to be used. Needs to include names of all columns in + the file, or names of all columns selected using `usecols` (only when + `usecols` holds integer indices). When `usecols` is not used to select + column indices, `names` can contain more names than there are columns in + the file. In this case the extra columns will only contain null rows. index_col : int, string or False, default None Column to use as the row labels of the DataFrame. Passing `index_col=False` explicitly disables index column inference and discards the last column. usecols : list of int or str, default None Returns subset of the columns given in the list. All elements must be either integer indices (column number) or strings that correspond to - column names + column names. When an integer index is passed for each name in the `names` + parameter, the names are interpreted as names in the output table, not as + names in the input file. prefix : str, default None - Prefix to add to column numbers when parsing without a header row + Prefix to add to column numbers when parsing without a header row. mangle_dupe_cols : boolean, default True Duplicate columns will be specified as 'X','X.1',...'X.N'. dtype : type, str, list of types, or dict of column -> type, default None