update mangle_dupe_cols behavior in csv reader to match pandas 1.4.0 behavior (#10749)

Fixes #10618
Depends on #10584

Authors:
- Karthikeyan (https://github.com/karthikeyann)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Yunsong Wang (https://github.com/PointKernel)

URL: #10749
rapidsai · May 16, 2022 · e58d049 · e58d049
1 parent 6591a6a
commit e58d049
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 31 deletions.
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
@@ -43,6 +43,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
 #include <iostream>
@@ -696,37 +697,62 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 
     column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);
 
+    std::vector<size_t> col_loop_order(column_names.size());
+    auto unnamed_it = std::copy_if(
+      thrust::make_counting_iterator<size_t>(0),
+      thrust::make_counting_iterator<size_t>(column_names.size()),
+      col_loop_order.begin(),
+      [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });
     // Rename empty column names to "Unnamed: col_index"
-    for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
-      if (column_names[col_idx].empty()) {
-        column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
-      }
-    }
+    std::copy_if(thrust::make_counting_iterator<size_t>(0),
+                 thrust::make_counting_iterator<size_t>(column_names.size()),
+                 unnamed_it,
+                 [&column_names](auto col_idx) -> bool {
+                   auto is_empty = column_names[col_idx].empty();
+                   if (is_empty)
+                     column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
+                   return is_empty;
+                 });
 
     // Looking for duplicates
-    std::unordered_map<string, int> col_names_histogram;
-    for (auto& col_name : column_names) {
-      // Operator [] inserts a default-initialized value if the given key is not
-      // present
-      if (++col_names_histogram[col_name] > 1) {
-        if (reader_opts.is_enabled_mangle_dupe_cols()) {
-          // Rename duplicates of column X as X.1, X.2, ...; First appearance
-          // stays as X
-          do {
-            col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
-          } while (col_names_histogram[col_name]++);
-        } else {
+    std::unordered_map<string, int> col_names_counts;
+    if (!reader_opts.is_enabled_mangle_dupe_cols()) {
+      for (auto& col_name : column_names) {
+        if (++col_names_counts[col_name] > 1) {
           // All duplicate columns will be ignored; First appearance is parsed
           const auto idx    = &col_name - column_names.data();
           column_flags[idx] = column_parse::disabled;
         }
       }
+    } else {
+      // For constant/linear search.
+      std::unordered_multiset<std::string> header(column_names.begin(), column_names.end());
+      for (auto const col_idx : col_loop_order) {
+        auto col       = column_names[col_idx];
+        auto cur_count = col_names_counts[col];
+        if (cur_count > 0) {
+          auto const old_col = col;
+          // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
+          while (cur_count > 0) {
+            col_names_counts[old_col] = cur_count + 1;
+            col                       = old_col + "." + std::to_string(cur_count);
+            if (header.find(col) != header.end()) {
+              cur_count++;
+            } else {
+              cur_count = col_names_counts[col];
+            }
+          }
+          if (auto pos = header.find(old_col); pos != header.end()) { header.erase(pos); }
+          header.insert(col);
+          column_names[col_idx] = col;
+        }
+        col_names_counts[col] = cur_count + 1;
+      }
     }
 
-    // Update the number of columns to be processed, if some might have been
-    // removed
+    // Update the number of columns to be processed, if some might have been removed
     if (!reader_opts.is_enabled_mangle_dupe_cols()) {
-      num_active_columns = col_names_histogram.size();
+      num_active_columns = col_names_counts.size();
     }
   }
 

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -473,20 +473,27 @@ def test_csv_reader_usecols_int_char(tmpdir, pd_mixed_dataframe):
     assert_eq(df_out, out, check_names=False)
 
 
-def test_csv_reader_mangle_dupe_cols(tmpdir):
-    buffer = "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n"
-
+@pytest.mark.parametrize(
+    "buffer",
+    [
+        "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n",
+        "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a",
+        "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a",
+    ],
+)
+@pytest.mark.parametrize("mangle_dupe_cols", [True, False])
+def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols):
     # Default: mangle_dupe_cols=True
-    pd_df = pd.read_csv(StringIO(buffer))
-    cu_df = read_csv(StringIO(buffer))
+    cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols)
+    if mangle_dupe_cols:
+        pd_df = pd.read_csv(StringIO(buffer))
+    else:
+        # Pandas does not support mangle_dupe_cols=False
+        head = buffer.split("\n")[0].split(",")
+        first_cols = np.unique(head, return_index=True)[1]
+        pd_df = pd.read_csv(StringIO(buffer), usecols=first_cols)
     assert_eq(cu_df, pd_df)
 
-    # Pandas does not support mangle_dupe_cols=False
-    cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=False)
-    # check that the dupe columns were removed
-    assert len(cu_df.columns) == 3
-    np.testing.assert_array_equal(cu_df["abc"].to_numpy(), [1])
-
 
 def test_csv_reader_float_decimal(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")