diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index cd070d28f38..d20155b4720 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -43,6 +43,7 @@
 #include
 #include
+#include <thrust/iterator/counting_iterator.h>
 #include
 #include
@@ -696,37 +697,62 @@ table_with_metadata read_csv(cudf::io::datasource* source,
     column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);
 
+    std::vector<size_t> col_loop_order(column_names.size());
+    auto unnamed_it = std::copy_if(
+      thrust::make_counting_iterator<size_t>(0),
+      thrust::make_counting_iterator(column_names.size()),
+      col_loop_order.begin(),
+      [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });
     // Rename empty column names to "Unnamed: col_index"
-    for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
-      if (column_names[col_idx].empty()) {
-        column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
-      }
-    }
+    std::copy_if(thrust::make_counting_iterator<size_t>(0),
+                 thrust::make_counting_iterator(column_names.size()),
+                 unnamed_it,
+                 [&column_names](auto col_idx) -> bool {
+                   auto is_empty = column_names[col_idx].empty();
+                   if (is_empty)
+                     column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
+                   return is_empty;
+                 });
 
     // Looking for duplicates
-    std::unordered_map<string, int> col_names_histogram;
-    for (auto& col_name : column_names) {
-      // Operator [] inserts a default-initialized value if the given key is not
-      // present
-      if (++col_names_histogram[col_name] > 1) {
-        if (reader_opts.is_enabled_mangle_dupe_cols()) {
-          // Rename duplicates of column X as X.1, X.2, ...; First appearance
-          // stays as X
-          do {
-            col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
-          } while (col_names_histogram[col_name]++);
-        } else {
+    std::unordered_map<string, std::size_t> col_names_counts;
+    if (!reader_opts.is_enabled_mangle_dupe_cols()) {
+      for (auto& col_name : column_names) {
+        if (++col_names_counts[col_name] > 1) {
           // All duplicate columns will be ignored; First appearance is parsed
           const auto idx    = &col_name - column_names.data();
           column_flags[idx] = column_parse::disabled;
         }
       }
+    } else {
+      // For constant/linear search.
+      std::unordered_multiset<std::string> header(column_names.begin(), column_names.end());
+      for (auto const col_idx : col_loop_order) {
+        auto col       = column_names[col_idx];
+        auto cur_count = col_names_counts[col];
+        if (cur_count > 0) {
+          auto const old_col = col;
+          // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
+          while (cur_count > 0) {
+            col_names_counts[old_col] = cur_count + 1;
+            col                       = old_col + "." + std::to_string(cur_count);
+            if (header.find(col) != header.end()) {
+              cur_count++;
+            } else {
+              cur_count = col_names_counts[col];
+            }
+          }
+          if (auto pos = header.find(old_col); pos != header.end()) { header.erase(pos); }
+          header.insert(col);
+          column_names[col_idx] = col;
+        }
+        col_names_counts[col] = cur_count + 1;
+      }
     }
-    // Update the number of columns to be processed, if some might have been
-    // removed
+    // Update the number of columns to be processed, if some might have been removed
     if (!reader_opts.is_enabled_mangle_dupe_cols()) {
-      num_active_columns = col_names_histogram.size();
+      num_active_columns = col_names_counts.size();
     }
   }
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index acad2507292..6ddc973b1a0 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -473,20 +473,27 @@ def test_csv_reader_usecols_int_char(tmpdir, pd_mixed_dataframe):
     assert_eq(df_out, out, check_names=False)
 
 
-def test_csv_reader_mangle_dupe_cols(tmpdir):
-    buffer = "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n"
-
+@pytest.mark.parametrize(
+    "buffer",
+    [
+        "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n",
+        "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a",
+        "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a",
+    ],
+)
+@pytest.mark.parametrize("mangle_dupe_cols", [True, False])
+def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols):
     # Default: mangle_dupe_cols=True
-    pd_df = pd.read_csv(StringIO(buffer))
-    cu_df = read_csv(StringIO(buffer))
+    cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols)
+    if mangle_dupe_cols:
+        pd_df = pd.read_csv(StringIO(buffer))
+    else:
+        # Pandas does not support mangle_dupe_cols=False
+        head = buffer.split("\n")[0].split(",")
+        first_cols = np.unique(head, return_index=True)[1]
+        pd_df = pd.read_csv(StringIO(buffer), usecols=first_cols)
     assert_eq(cu_df, pd_df)
 
-    # Pandas does not support mangle_dupe_cols=False
-    cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=False)
-    # check that the dupe columns were removed
-    assert len(cu_df.columns) == 3
-    np.testing.assert_array_equal(cu_df["abc"].to_numpy(), [1])
-
 
 def test_csv_reader_float_decimal(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")