Skip to content

Commit

Permalink
Fix handling of empty column name in csv writer (#6692)
Browse files Browse the repository at this point in the history
Fixes: #6663, #6420

This PR:

 Removed pre-maturely renaming index name to '' if it is None. Instead introduced a flag is_index_name_none is introduced, to let the python layer decide insert '' while metadata generation. Since we were renaming index name to '' and when a dataframe with column name '' is given our code will break in reset_index. This is now fixed.

 Handles empty dataframe column names correctly for both index=True/False combinations.
  • Loading branch information
galipremsagar authored Nov 7, 2020
1 parent bfb1b88 commit bc43878
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
- PR #6643 Fix csv writer handling embedded comma delimiter
- PR #6640 Add error message for unsupported `axis` parameter in DataFrame APIs
- PR #6687 Fix issue where index name of caller object is being modified in csv writer
- PR #6692 Fix handling of empty column name in csv writer
- PR #6693 Fix issue related to `na_values` input in `read_csv`


Expand Down
35 changes: 27 additions & 8 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ from cudf._lib.cpp.io.types cimport (
table_with_metadata
)
from cudf._lib.io.utils cimport make_source_info, make_sink_info
from cudf._lib.table cimport Table
from cudf._lib.table cimport Table, make_table_view
from cudf._lib.cpp.table.table_view cimport table_view

ctypedef int32_t underlying_type_t_compression
Expand Down Expand Up @@ -403,17 +403,18 @@ cpdef write_csv(
bool header=True,
str line_terminator="\n",
int rows_per_chunk=8,
bool index=True,
):
"""
Cython function to call into libcudf API, see `write_csv`.
See Also
--------
cudf.io.csv.write_csv
cudf.io.csv.to_csv
"""

# Index already been reset and added as main column, so just data_view
cdef table_view input_table_view = table.data_view()
cdef table_view input_table_view = \
table.view() if index is True else table.data_view()
cdef bool include_header_c = header
cdef char delim_c = ord(sep)
cdef string line_term_c = line_terminator.encode()
Expand All @@ -425,10 +426,28 @@ cpdef write_csv(
cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)

if header is True and table._column_names is not None:
metadata_.column_names.reserve(len(table._column_names))
for col_name in table._column_names:
metadata_.column_names.push_back(str(col_name).encode())
if header is True:
all_names = table._column_names
if index is True:
all_names = table._index.names + all_names

if len(all_names) > 0:
metadata_.column_names.reserve(len(all_names))
if len(all_names) == 1:
if all_names[0] in (None, ''):
metadata_.column_names.push_back('""'.encode())
else:
metadata_.column_names.push_back(
str(all_names[0]).encode()
)
else:
for idx, col_name in enumerate(all_names):
if col_name is None:
metadata_.column_names.push_back(''.encode())
else:
metadata_.column_names.push_back(
str(col_name).encode()
)

cdef csv_writer_options options = move(
csv_writer_options.builder(sink_info_c, input_table_view)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,11 @@ def insert(self, name, value, loc=-1):
loc = ncols
if not (0 <= loc <= ncols):
raise ValueError(
"insert: loc out of bounds: must be " " 0 <= loc <= ncols"
"insert: loc out of bounds: must be 0 <= loc <= ncols"
)
# TODO: we should move all insert logic here
if name in self._data:
raise ValueError(f"Cannot insert {name}, already exists")
raise ValueError(f"Cannot insert '{name}', already exists")
if loc == len(self._data):
self._data[name] = value
else:
Expand Down
17 changes: 4 additions & 13 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2018-20, NVIDIA CORPORATION.
# Copyright (c) 2018-2020, NVIDIA CORPORATION.

from io import BytesIO, StringIO

from nvtx import annotate
Expand Down Expand Up @@ -118,18 +119,6 @@ def to_csv(
path_or_data=path_or_buf, mode="w", **kwargs
)

if index:
from cudf import MultiIndex

if not isinstance(df.index, MultiIndex):
if df.index.name is None:
df = df.copy(deep=False)
df.index.name = ""
if columns is not None:
columns = columns.copy()
columns.insert(0, df.index.name)
df = df.reset_index()

if columns is not None:
try:
df = df[columns]
Expand All @@ -151,6 +140,7 @@ def to_csv(
header=header,
line_terminator=line_terminator,
rows_per_chunk=rows_per_chunk,
index=index,
)
else:
libcudf.csv.write_csv(
Expand All @@ -161,6 +151,7 @@ def to_csv(
header=header,
line_terminator=line_terminator,
rows_per_chunk=rows_per_chunk,
index=index,
)

if return_as_string:
Expand Down
49 changes: 49 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1691,3 +1691,52 @@ def test_csv_write_no_caller_manipulation():
df_copy = df.copy(deep=True)
_ = df.to_csv(index=True)
assert_eq(df, df_copy)


@pytest.mark.parametrize(
"df",
[
cudf.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}),
cudf.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}),
cudf.DataFrame(
{"a": [1, 2, 3], "": [10, 20, 40]},
index=cudf.Index(["a", "z", "v"], name="custom name"),
),
],
)
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("columns", [["a"], [""], None])
def test_csv_write_empty_column_name(df, index, columns):
pdf = df.to_pandas()
expected = pdf.to_csv(index=index, columns=columns)
actual = df.to_csv(index=index, columns=columns)

assert expected == actual


@pytest.mark.parametrize(
"df",
[
cudf.DataFrame(),
cudf.DataFrame(index=cudf.Index([], name="index name")),
],
)
@pytest.mark.parametrize(
"index",
[
True,
pytest.param(
False,
marks=pytest.mark.xfail(
reason="https://github.com/rapidsai/cudf/issues/6691"
),
),
],
)
def test_csv_write_empty_dataframe(df, index):
pdf = df.to_pandas()

expected = pdf.to_csv(index=index)
actual = df.to_csv(index=index)

assert expected == actual

0 comments on commit bc43878

Please sign in to comment.