Fix handling of empty column name in csv writer (#6692)

Fixes: #6663, #6420 This PR: Removed pre-maturely renaming index name to '' if it is None. Instead introduced a flag is_index_name_none is introduced, to let the python layer decide insert '' while metadata generation. Since we were renaming index name to '' and when a dataframe with column name '' is given our code will break in reset_index. This is now fixed. Handles empty dataframe column names correctly for both index=True/False combinations.
rapidsai · Nov 7, 2020 · bc43878 · bc43878
1 parent bfb1b88
commit bc43878
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -85,6 +85,7 @@
 - PR #6643 Fix csv writer handling embedded comma delimiter
 - PR #6640 Add error message for unsupported `axis` parameter in DataFrame APIs
 - PR #6687 Fix issue where index name of caller object is being modified in csv writer
+- PR #6692 Fix handling of empty column name in csv writer
 - PR #6693 Fix issue related to `na_values` input in `read_csv`
 
 

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -38,7 +38,7 @@ from cudf._lib.cpp.io.types cimport (
     table_with_metadata
 )
 from cudf._lib.io.utils cimport make_source_info, make_sink_info
-from cudf._lib.table cimport Table
+from cudf._lib.table cimport Table, make_table_view
 from cudf._lib.cpp.table.table_view cimport table_view
 
 ctypedef int32_t underlying_type_t_compression
@@ -403,17 +403,18 @@ cpdef write_csv(
     bool header=True,
     str line_terminator="\n",
     int rows_per_chunk=8,
+    bool index=True,
 ):
     """
     Cython function to call into libcudf API, see `write_csv`.
 
     See Also
     --------
-    cudf.io.csv.write_csv
+    cudf.io.csv.to_csv
     """
 
-    # Index already been reset and added as main column, so just data_view
-    cdef table_view input_table_view = table.data_view()
+    cdef table_view input_table_view = \
+        table.view() if index is True else table.data_view()
     cdef bool include_header_c = header
     cdef char delim_c = ord(sep)
     cdef string line_term_c = line_terminator.encode()
@@ -425,10 +426,28 @@ cpdef write_csv(
     cdef unique_ptr[data_sink] data_sink_c
     cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
 
-    if header is True and table._column_names is not None:
-        metadata_.column_names.reserve(len(table._column_names))
-        for col_name in table._column_names:
-            metadata_.column_names.push_back(str(col_name).encode())
+    if header is True:
+        all_names = table._column_names
+        if index is True:
+            all_names = table._index.names + all_names
+
+        if len(all_names) > 0:
+            metadata_.column_names.reserve(len(all_names))
+            if len(all_names) == 1:
+                if all_names[0] in (None, ''):
+                    metadata_.column_names.push_back('""'.encode())
+                else:
+                    metadata_.column_names.push_back(
+                        str(all_names[0]).encode()
+                    )
+            else:
+                for idx, col_name in enumerate(all_names):
+                    if col_name is None:
+                        metadata_.column_names.push_back(''.encode())
+                    else:
+                        metadata_.column_names.push_back(
+                            str(col_name).encode()
+                        )
 
     cdef csv_writer_options options = move(
         csv_writer_options.builder(sink_info_c, input_table_view)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
@@ -166,11 +166,11 @@ def insert(self, name, value, loc=-1):
             loc = ncols
         if not (0 <= loc <= ncols):
             raise ValueError(
-                "insert: loc out of bounds: must be " " 0 <= loc <= ncols"
+                "insert: loc out of bounds: must be  0 <= loc <= ncols"
             )
         # TODO: we should move all insert logic here
         if name in self._data:
-            raise ValueError(f"Cannot insert {name}, already exists")
+            raise ValueError(f"Cannot insert '{name}', already exists")
         if loc == len(self._data):
             self._data[name] = value
         else:

diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2018-20, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
 from io import BytesIO, StringIO
 
 from nvtx import annotate
@@ -118,18 +119,6 @@ def to_csv(
         path_or_data=path_or_buf, mode="w", **kwargs
     )
 
-    if index:
-        from cudf import MultiIndex
-
-        if not isinstance(df.index, MultiIndex):
-            if df.index.name is None:
-                df = df.copy(deep=False)
-                df.index.name = ""
-            if columns is not None:
-                columns = columns.copy()
-                columns.insert(0, df.index.name)
-        df = df.reset_index()
-
     if columns is not None:
         try:
             df = df[columns]
@@ -151,6 +140,7 @@ def to_csv(
                 header=header,
                 line_terminator=line_terminator,
                 rows_per_chunk=rows_per_chunk,
+                index=index,
             )
     else:
         libcudf.csv.write_csv(
@@ -161,6 +151,7 @@ def to_csv(
             header=header,
             line_terminator=line_terminator,
             rows_per_chunk=rows_per_chunk,
+            index=index,
         )
 
     if return_as_string:

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -1691,3 +1691,52 @@ def test_csv_write_no_caller_manipulation():
     df_copy = df.copy(deep=True)
     _ = df.to_csv(index=True)
     assert_eq(df, df_copy)
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        cudf.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}),
+        cudf.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}),
+        cudf.DataFrame(
+            {"a": [1, 2, 3], "": [10, 20, 40]},
+            index=cudf.Index(["a", "z", "v"], name="custom name"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("index", [True, False])
+@pytest.mark.parametrize("columns", [["a"], [""], None])
+def test_csv_write_empty_column_name(df, index, columns):
+    pdf = df.to_pandas()
+    expected = pdf.to_csv(index=index, columns=columns)
+    actual = df.to_csv(index=index, columns=columns)
+
+    assert expected == actual
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        cudf.DataFrame(),
+        cudf.DataFrame(index=cudf.Index([], name="index name")),
+    ],
+)
+@pytest.mark.parametrize(
+    "index",
+    [
+        True,
+        pytest.param(
+            False,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/6691"
+            ),
+        ),
+    ],
+)
+def test_csv_write_empty_dataframe(df, index):
+    pdf = df.to_pandas()
+
+    expected = pdf.to_csv(index=index)
+    actual = df.to_csv(index=index)
+
+    assert expected == actual