From c05dbed52fdd15757e40463a64ce757d6cd21b46 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 Mar 2021 12:32:09 -0500 Subject: [PATCH] Add column names validation in parquet writer (#7786) Fixes: #7738 The Parquet writer requires all column names to be strings; this change adds a validation similar to the one in pandas. Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Michael Wang (@isVoid) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7786 --- python/cudf/cudf/_lib/parquet.pyx | 3 +++ python/cudf/cudf/tests/test_parquet.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8b4fbbbe4b..4ea2adec23a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -312,6 +312,9 @@ cpdef write_parquet( num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + raise ValueError("parquet must have string column names") + tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fe418d1ade1..4781ff995b0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -19,7 +19,7 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata from cudf.tests import dataset_generator as dg -from cudf.tests.utils import assert_eq +from cudf.tests.utils import assert_eq, assert_exceptions_equal @pytest.fixture(scope="module") @@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir): got = pd.read_parquet(fname) assert_eq(gdf, got) + + +def test_parquet_writer_column_validation(): + df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) + pdf = df.to_pandas() + + assert_exceptions_equal( + lfunc=df.to_parquet, + 
rfunc=pdf.to_parquet, + lfunc_args_and_kwargs=(["cudf.parquet"],), + rfunc_args_and_kwargs=(["pandas.parquet"],), + )