Skip to content

Commit

Permalink
Add column names validation in parquet writer (rapidsai#7786)
Browse files Browse the repository at this point in the history
Fixes: rapidsai#7738 

The Parquet writer requires all column names to be strings; this change adds a validation similar to the one pandas performs.

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Michael Wang (@isVoid)
  - Keith Kraus (@kkraus14)

URL: rapidsai#7786
  • Loading branch information
galipremsagar authored Mar 31, 2021
1 parent b937112 commit c05dbed
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ cpdef write_parquet(
num_index_cols_meta = 0

for i, name in enumerate(table._column_names, num_index_cols_meta):
if not isinstance(name, str):
raise ValueError("parquet must have string column names")

tbl_meta.get().column_metadata[i].set_name(name.encode())
_set_col_metadata(
table[name]._column, tbl_meta.get().column_metadata[i]
Expand Down
14 changes: 13 additions & 1 deletion python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import cudf
from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
from cudf.tests import dataset_generator as dg
from cudf.tests.utils import assert_eq
from cudf.tests.utils import assert_eq, assert_exceptions_equal


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir):

got = pd.read_parquet(fname)
assert_eq(gdf, got)


def test_parquet_writer_column_validation():
    """Check cudf's parquet writer against pandas for non-string column names.

    The frame mixes an integer column name (``1``) with a string one
    (``"1"``). Both ``to_parquet`` implementations are handed to
    ``assert_exceptions_equal``, which presumably verifies that they raise
    matching exceptions (helper defined in ``cudf.tests.utils``; confirm
    its exact contract there).
    """
    gdf = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
    expected_frame = gdf.to_pandas()

    # Each side gets its own target path as the sole positional argument.
    cudf_call = (["cudf.parquet"],)
    pandas_call = (["pandas.parquet"],)

    assert_exceptions_equal(
        lfunc=gdf.to_parquet,
        rfunc=expected_frame.to_parquet,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )

0 comments on commit c05dbed

Please sign in to comment.