Fix issues when both usecols and names options are used in `read_csv` (#12018)

closes #8973
The CSV reader has a few gaps in the logic for column selection and user-specified column names:
1. Users cannot specify the names of only the selected columns;
2. The reader fails in unpredictable ways when only a subset of column names is passed (without column selection).

This PR fixes the issues above. Users can now specify fewer column names than the actual number of columns, or specify names only for the columns selected via their indices (in which case the number of names must match the number of indices). If selection via indices is used, the number of column names has to match either the actual number of columns or the number of selected columns.

Also fixed a test error that went unnoticed due to the issues above.
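
For reference, a minimal Python sketch of the combinations this change enables, modeled on the Python test added in this PR (the sample data is illustrative only):

```python
from io import StringIO

import cudf

buffer = "123,2018-11-13T12:00:00,abc\n456,2018-11-14T12:35:01,def\n"

# Select columns 1 and 2 by index and pass names only for the selected
# columns; the output table has columns "b" and "c".
df = cudf.read_csv(StringIO(buffer), usecols=[1, 2], names=["b", "c"])

# Alternatively, name all columns in the file and still select a subset.
df2 = cudf.read_csv(StringIO(buffer), usecols=[1, 2], names=["a", "b", "c"])
```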

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)
  - https://github.com/nvdbaranec

URL: #12018
vuule authored Nov 17, 2022
1 parent ae101cc commit 6de2c4e
Showing 4 changed files with 233 additions and 39 deletions.
93 changes: 60 additions & 33 deletions cpp/src/io/csv/reader_impl.cu
@@ -676,32 +676,37 @@ table_with_metadata read_csv(cudf::io::datasource* source,
auto const& data = data_row_offsets.first;
auto const& row_offsets = data_row_offsets.second;

// Exclude the end-of-data row from number of rows with actual data
auto num_records = std::max(row_offsets.size(), 1ul) - 1;
auto column_flags = std::vector<column_parse::flags>();
auto column_names = std::vector<std::string>();
auto num_actual_columns = static_cast<int32_t>(reader_opts.get_names().size());
auto num_active_columns = num_actual_columns;

// Check if the user gave us a list of column names
if (not reader_opts.get_names().empty()) {
column_flags.resize(reader_opts.get_names().size(),
column_parse::enabled | column_parse::inferred);
column_names = reader_opts.get_names();
} else {
column_names = get_column_names(
header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());

num_actual_columns = num_active_columns = column_names.size();

column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);

auto const unique_use_cols_indexes = std::set(reader_opts.get_use_cols_indexes().cbegin(),
reader_opts.get_use_cols_indexes().cend());

auto const detected_column_names =
get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
auto const opts_have_all_col_names =
not reader_opts.get_names().empty() and
(
// no data to detect (the number of) columns
detected_column_names.empty() or
// number of user specified names matches what is detected
reader_opts.get_names().size() == detected_column_names.size() or
// Columns are not selected by indices; read first reader_opts.get_names().size() columns
unique_use_cols_indexes.empty());
auto column_names = opts_have_all_col_names ? reader_opts.get_names() : detected_column_names;

auto const num_actual_columns = static_cast<int32_t>(column_names.size());
auto num_active_columns = num_actual_columns;
auto column_flags = std::vector<column_parse::flags>(
num_actual_columns, column_parse::enabled | column_parse::inferred);

// User did not pass column names to override names in the file
// Process names from the file to remove empty and duplicated strings
if (not opts_have_all_col_names) {
std::vector<size_t> col_loop_order(column_names.size());
auto unnamed_it = std::copy_if(
thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>(column_names.size()),
col_loop_order.begin(),
[&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });

// Rename empty column names to "Unnamed: col_index"
std::copy_if(thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>(column_names.size()),
@@ -756,24 +761,44 @@ table_with_metadata read_csv(cudf::io::datasource* source,
}

// User can specify which columns should be parsed
if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) {
auto const unique_use_cols_names = std::unordered_set(reader_opts.get_use_cols_names().cbegin(),
reader_opts.get_use_cols_names().cend());
auto const is_column_selection_used =
not unique_use_cols_names.empty() or not unique_use_cols_indexes.empty();

// Reset flags and output column count; columns will be reactivated based on the selection options
if (is_column_selection_used) {
std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled);
num_active_columns = 0;
}

// Column selection via column indexes
if (not unique_use_cols_indexes.empty()) {
// Users can pass names for the selected columns only, if selecting column by their indices
auto const are_opts_col_names_used =
not reader_opts.get_names().empty() and not opts_have_all_col_names;
CUDF_EXPECTS(not are_opts_col_names_used or
reader_opts.get_names().size() == unique_use_cols_indexes.size(),
"Specify names of all columns in the file, or names of all selected columns");

for (const auto index : reader_opts.get_use_cols_indexes()) {
for (auto const index : unique_use_cols_indexes) {
column_flags[index] = column_parse::enabled | column_parse::inferred;
if (are_opts_col_names_used) {
column_names[index] = reader_opts.get_names()[num_active_columns];
}
++num_active_columns;
}
num_active_columns = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
reader_opts.get_use_cols_indexes().end())
.size();
}

for (const auto& name : reader_opts.get_use_cols_names()) {
const auto it = std::find(column_names.begin(), column_names.end(), name);
if (it != column_names.end()) {
auto curr_it = it - column_names.begin();
if (column_flags[curr_it] == column_parse::disabled) {
column_flags[curr_it] = column_parse::enabled | column_parse::inferred;
num_active_columns++;
}
// Column selection via column names
if (not unique_use_cols_names.empty()) {
for (auto const& name : unique_use_cols_names) {
auto const it = std::find(column_names.cbegin(), column_names.cend(), name);
CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected");
auto const col_idx = std::distance(column_names.cbegin(), it);
if (column_flags[col_idx] == column_parse::disabled) {
column_flags[col_idx] = column_parse::enabled | column_parse::inferred;
++num_active_columns;
}
}
}
@@ -810,6 +835,8 @@ table_with_metadata read_csv(cudf::io::datasource* source,
// Return empty table rather than exception if nothing to load
if (num_active_columns == 0) { return {std::make_unique<table>(), {}}; }

// Exclude the end-of-data row from number of rows with actual data
auto const num_records = std::max(row_offsets.size(), 1ul) - 1;
auto const column_types = determine_column_types(
reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream);

129 changes: 126 additions & 3 deletions cpp/tests/io/csv_test.cpp
@@ -904,7 +904,7 @@ TEST_F(CsvReaderTest, Strings)
auto filepath = temp_env->get_temp_dir() + "Strings.csv";
{
std::ofstream outfile(filepath, std::ofstream::out);
outfile << names[0] << ',' << names[1] << ',' << '\n';
outfile << names[0] << ',' << names[1] << '\n';
outfile << "10,abc def ghi" << '\n';
outfile << "20,\"jkl mno pqr\"" << '\n';
outfile << "30,stu \"\"vwx\"\" yz" << '\n';
@@ -934,7 +934,7 @@ TEST_F(CsvReaderTest, StringsQuotes)
auto filepath = temp_env->get_temp_dir() + "StringsQuotes.csv";
{
std::ofstream outfile(filepath, std::ofstream::out);
outfile << names[0] << ',' << names[1] << ',' << '\n';
outfile << names[0] << ',' << names[1] << '\n';
outfile << "10,`abc,\ndef, ghi`" << '\n';
outfile << "20,`jkl, ``mno``, pqr`" << '\n';
outfile << "30,stu `vwx` yz" << '\n';
@@ -963,7 +963,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv";
{
std::ofstream outfile(filepath, std::ofstream::out);
outfile << names[0] << ',' << names[1] << ',' << '\n';
outfile << names[0] << ',' << names[1] << '\n';
outfile << "10,\"abcdef ghi\"" << '\n';
outfile << "20,\"jkl \"\"mno\"\" pqr\"" << '\n';
outfile << "30,stu \"vwx\" yz" << '\n';
@@ -2244,6 +2244,129 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
}

TEST_F(CsvReaderTest, UseColsValidation)
{
const std::string buffer = "1,2,3";

const cudf::io::csv_reader_options idx_cnt_options =
cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
.names({"a", "b"})
.use_cols_indexes({0});
EXPECT_THROW(cudf::io::read_csv(idx_cnt_options), cudf::logic_error);

cudf::io::csv_reader_options unique_idx_cnt_options =
cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
.names({"a", "b"})
.use_cols_indexes({0, 0});
EXPECT_THROW(cudf::io::read_csv(unique_idx_cnt_options), cudf::logic_error);

cudf::io::csv_reader_options bad_name_options =
cudf::io::csv_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
.names({"a", "b", "c"})
.use_cols_names({"nonexistent_name"});
EXPECT_THROW(cudf::io::read_csv(bad_name_options), cudf::logic_error);
}

TEST_F(CsvReaderTest, CropColumns)
{
const std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};

cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
.names({"a", "b"})
.header(-1);
const auto result = cudf::io::read_csv(in_opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 2);
ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32});
ASSERT_EQ(result_table.column(1).type(), data_type{type_id::FLOAT32});
expect_column_data_equal(std::vector<int32_t>{12, 34, 56}, result_table.column(0));
expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(1));
}

TEST_F(CsvReaderTest, CropColumnsUseColsNames)
{
std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};

cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<float>()})
.names({"a", "b"})
.use_cols_names({"b"})
.header(-1);
auto result = cudf::io::read_csv(in_opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 1);
ASSERT_EQ(result_table.column(0).type(), data_type{type_id::FLOAT32});
expect_column_data_equal(std::vector<float>{9., 8., 7.}, result_table.column(0));
}

TEST_F(CsvReaderTest, ExtraColumns)
{
std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};
{
cudf::io::csv_reader_options opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.names({"a", "b", "c", "d"})
.header(-1);
auto result = cudf::io::read_csv(opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 4);
ASSERT_EQ(result_table.column(3).type(), data_type{type_id::INT8});
ASSERT_EQ(result_table.column(3).null_count(), 3);
}
{
cudf::io::csv_reader_options with_dtypes_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.names({"a", "b", "c", "d"})
.dtypes({dtype<int32_t>(), dtype<int32_t>(), dtype<int32_t>(), dtype<float>()})
.header(-1);
auto result = cudf::io::read_csv(with_dtypes_opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 4);
ASSERT_EQ(result_table.column(3).type(), data_type{type_id::FLOAT32});
ASSERT_EQ(result_table.column(3).null_count(), 3);
}
}

TEST_F(CsvReaderTest, ExtraColumnsUseCols)
{
std::string csv_in{"12,9., 10\n34,8., 20\n56,7., 30"};

{
cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.names({"a", "b", "c", "d"})
.use_cols_names({"b", "d"})
.header(-1);
auto result = cudf::io::read_csv(in_opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 2);
ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT8});
ASSERT_EQ(result_table.column(1).null_count(), 3);
}
{
cudf::io::csv_reader_options with_dtypes_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
.names({"a", "b", "c", "d"})
.use_cols_names({"b", "d"})
.dtypes({dtype<int32_t>(), dtype<int32_t>(), dtype<int32_t>(), dtype<cudf::string_view>()})
.header(-1);
auto result = cudf::io::read_csv(with_dtypes_opts);

const auto result_table = result.tbl->view();
ASSERT_EQ(result_table.num_columns(), 2);
ASSERT_EQ(result_table.column(1).type(), data_type{type_id::STRING});
ASSERT_EQ(result_table.column(1).null_count(), 3);
}
}

TEST_F(CsvReaderTest, EmptyColumns)
{
// First column only has empty fields. second column contains only "null" literals
38 changes: 38 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
@@ -2205,3 +2205,41 @@ def test_default_float_bitwidth_partial(default_float_bitwidth):
)
assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
assert read["float2"].dtype == np.dtype("f8")


@pytest.mark.parametrize(
"usecols,names",
[
# selection using indices; only names of selected columns are specified
([1, 2], ["b", "c"]),
# selection using indices; names of all columns are specified
([1, 2], ["a", "b", "c"]),
# selection using indices; duplicates
([2, 2], ["a", "b", "c"]),
# selection using indices; out of order
([2, 1], ["a", "b", "c"]),
# selection using names
(["b"], ["a", "b", "c"]),
# selection using names; multiple columns
(["b", "c"], ["a", "b", "c"]),
# selection using names; duplicates
(["c", "c"], ["a", "b", "c"]),
# selection using names; out of order
(["c", "b"], ["a", "b", "c"]),
],
)
def test_column_selection_plus_column_names(usecols, names):

lines = [
"num,datetime,text",
"123,2018-11-13T12:00:00,abc",
"456,2018-11-14T12:35:01,def",
"789,2018-11-15T18:02:59,ghi",
]

buffer = "\n".join(lines) + "\n"

assert_eq(
pd.read_csv(StringIO(buffer), usecols=usecols, names=names),
cudf.read_csv(StringIO(buffer), usecols=usecols, names=names),
)
12 changes: 9 additions & 3 deletions python/cudf/cudf/utils/ioutils.py
@@ -1029,16 +1029,22 @@
the column names: if no names are passed, header=0;
if column names are passed explicitly, header=None.
names : list of str, default None
List of column names to be used.
List of column names to be used. Needs to include names of all columns in
the file, or names of all columns selected using `usecols` (only when
`usecols` holds integer indices). When `usecols` is not used to select
column indices, `names` can contain more names than there are columns in
the file. In this case the extra columns will only contain null rows.
index_col : int, string or False, default None
Column to use as the row labels of the DataFrame. Passing `index_col=False`
explicitly disables index column inference and discards the last column.
usecols : list of int or str, default None
Returns subset of the columns given in the list. All elements must be
either integer indices (column number) or strings that correspond to
column names
column names. When an integer index is passed for each name in the `names`
parameter, the names are interpreted as names in the output table, not as
names in the input file.
prefix : str, default None
Prefix to add to column numbers when parsing without a header row
Prefix to add to column numbers when parsing without a header row.
mangle_dupe_cols : boolean, default True
Duplicate columns will be specified as 'X','X.1',...'X.N'.
dtype : type, str, list of types, or dict of column -> type, default None
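
As a hedged illustration of the `names`/`usecols` interaction described in the docstring above (the data and column names are assumptions for illustration):

```python
from io import StringIO

import cudf

buffer = "10,1.5\n20,2.5\n"  # a headerless file with two columns

# More names than columns in the file: the extra column "c" is created and
# contains only nulls.
df = cudf.read_csv(StringIO(buffer), names=["a", "b", "c"])

# Integer usecols plus names for only the selected columns: the names label
# the output table, not columns of the input file.
df2 = cudf.read_csv(StringIO(buffer), usecols=[1], names=["value"])
```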
0 comments on commit 6de2c4e

Please sign in to comment.