Skip to content

Commit

Permalink
Apply na_rep to column names in csv writer (#6708)
Browse files Browse the repository at this point in the history
Fixes: #6688

`na_rep` was not being applied to column names, even though column names can also be null (for example `None`). This PR applies `na_rep` to column names as well.
  • Loading branch information
galipremsagar authored Nov 9, 2020
1 parent 8dd9323 commit 3bd593d
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
- PR #6693 Fix issue related to `na_values` input in `read_csv`
- PR #6701 Fix issue when `numpy.str_` is given as input to string parameters in io APIs
- PR #6704 Fix leak warnings in JNI unit tests
- PR #6708 Apply `na_rep` to column names in csv writer


# cuDF 0.16.0 (21 Oct 2020)
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.utility cimport move

import pandas as pd
import cudf

from cudf._lib.cpp.types cimport size_type
Expand Down Expand Up @@ -428,7 +429,7 @@ cpdef write_csv(
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)

if header is True:
all_names = table._column_names
all_names = columns_apply_na_rep(table._column_names, na_rep)
if index is True:
all_names = table._index.names + all_names

Expand Down Expand Up @@ -465,3 +466,11 @@ cpdef write_csv(

with nogil:
cpp_write_csv(options)


def columns_apply_na_rep(column_names, na_rep):
    """Replace null column names with ``na_rep``.

    Parameters
    ----------
    column_names : iterable
        Column labels that will be written in the CSV header.
    na_rep : object
        Value substituted for every label that ``pd.isnull`` reports as
        null (for example ``None`` or ``nan``).

    Returns
    -------
    tuple
        The labels in their original order, with null labels replaced by
        ``na_rep`` and all other labels left untouched.
    """
    return tuple(
        na_rep if pd.isnull(col_name) else col_name
        for col_name in column_names
    )
36 changes: 36 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,3 +1743,39 @@ def test_csv_write_empty_dataframe(df, index):
actual = df.to_csv(index=index)

assert expected == actual


@pytest.mark.parametrize(
    "df",
    [
        pd.DataFrame(
            {
                "a": [1, 2, 3, None],
                "": ["a", "v", None, None],
                None: [12, 12, 32, 44],
            }
        ),
        pd.DataFrame(
            {
                np.nan: [1, 2, 3, None],
                "": ["a", "v", None, None],
                None: [12, 12, 32, 44],
            }
        ),
        pd.DataFrame({"": [1, None, 3, 4]}),
        pd.DataFrame({None: [1, None, 3, 4]}),
        pd.DataFrame(columns=[None, "", "a", "b"]),
        pd.DataFrame(columns=[None]),
        pd.DataFrame(columns=[""]),
    ],
)
@pytest.mark.parametrize(
    "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"]
)
def test_csv_write_dataframe_na_rep(df, na_rep):
    """Check that cudf's ``to_csv`` output matches pandas for each ``na_rep``.

    The parametrized frames use ``None``, ``nan`` and empty-string column
    names, both with and without data, so the header row is compared as
    well as the values.
    """
    gdf = cudf.from_pandas(df)

    expected = df.to_csv(na_rep=na_rep)
    actual = gdf.to_csv(na_rep=na_rep)

    assert expected == actual

0 comments on commit 3bd593d

Please sign in to comment.