Apply na_rep to column names in csv writer (#6708)

Fixes: #6688. `na_rep` was not being applied to column names, even though column names can be null (e.g. `None`) too. This PR applies `na_rep` to null column names as well.
rapidsai · Nov 9, 2020 · 3bd593d · 3bd593d
1 parent 8dd9323
commit 3bd593d
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -94,6 +94,7 @@
 - PR #6693 Fix issue related to `na_values` input in `read_csv`
 - PR #6701 Fix issue when `numpy.str_` is given as input to string parameters in io APIs
 - PR #6704 Fix leak warnings in JNI unit tests
+- PR #6708 Apply `na_rep` to column names in csv writer
 
 
 # cuDF 0.16.0 (21 Oct 2020)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -6,6 +6,7 @@ from libcpp.string cimport string
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 
+import pandas as pd
 import cudf
 
 from cudf._lib.cpp.types cimport size_type
@@ -428,7 +429,7 @@ cpdef write_csv(
     cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
 
     if header is True:
-        all_names = table._column_names
+        all_names = columns_apply_na_rep(table._column_names, na_rep)
         if index is True:
             all_names = table._index.names + all_names
 
@@ -465,3 +466,11 @@ cpdef write_csv(
 
     with nogil:
         cpp_write_csv(options)
+
+
+def columns_apply_na_rep(column_names, na_rep):
+    """Substitute ``na_rep`` for every null column name.
+
+    Parameters
+    ----------
+    column_names : iterable
+        Column names, in the order they should appear in the result.
+    na_rep
+        Replacement used for each name that ``pd.isnull`` reports as null.
+
+    Returns
+    -------
+    tuple
+        The names in their original order, with null names replaced by
+        ``na_rep`` and all other names passed through unchanged.
+    """
+    resolved_names = []
+    for name in column_names:
+        if pd.isnull(name):
+            # Null name: emit the caller-supplied replacement instead.
+            resolved_names.append(na_rep)
+        else:
+            resolved_names.append(name)
+    return tuple(resolved_names)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -1743,3 +1743,39 @@ def test_csv_write_empty_dataframe(df, index):
     actual = df.to_csv(index=index)
 
     assert expected == actual
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        # A ``None`` column name next to a regular and an empty-string name.
+        pd.DataFrame(
+            {
+                "a": [1, 2, 3, None],
+                "": ["a", "v", None, None],
+                None: [12, 12, 32, 44],
+            }
+        ),
+        # Both a NaN and a ``None`` column name in the same frame.
+        pd.DataFrame(
+            {
+                np.nan: [1, 2, 3, None],
+                "": ["a", "v", None, None],
+                None: [12, 12, 32, 44],
+            }
+        ),
+        # Single-column frames: empty-string name, then ``None`` name.
+        pd.DataFrame({"": [1, None, 3, 4]}),
+        pd.DataFrame({None: [1, None, 3, 4]}),
+        # Frames with column names only and no rows.
+        pd.DataFrame(columns=[None, "", "a", "b"]),
+        pd.DataFrame(columns=[None]),
+        pd.DataFrame(columns=[""]),
+    ],
+)
+@pytest.mark.parametrize(
+    "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"]
+)
+def test_csv_write_dataframe_na_rep(df, na_rep):
+    """Check that cudf's ``to_csv`` output matches pandas for ``na_rep``.
+
+    Each parametrized pandas frame is converted with ``cudf.from_pandas``;
+    both frames are then written with the same ``na_rep`` and the resulting
+    CSV strings are compared for equality.
+    """
+    cudf_frame = cudf.from_pandas(df)
+
+    # pandas output serves as the reference for the cudf writer.
+    pandas_csv = df.to_csv(na_rep=na_rep)
+    cudf_csv = cudf_frame.to_csv(na_rep=na_rep)
+
+    assert pandas_csv == cudf_csv