From 6894263481c954c7874776b0bd0cde967cc97b73 Mon Sep 17 00:00:00 2001
From: galipremsagar
Date: Wed, 22 Mar 2023 16:47:06 -0700
Subject: [PATCH 1/2] Fix set_nullability

---
 python/cudf/cudf/_lib/parquet.pyx | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 59571b0e4b3..0ebda815ee9 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -698,7 +698,14 @@ cdef _set_col_metadata(
     column_in_metadata& col_meta,
     bool force_nullable_schema
 ):
-    col_meta.set_nullability(force_nullable_schema or col.nullable)
+    if force_nullable_schema or not col.nullable:
+        # We only need to set/define `_nullable` in libcudf
+        # in two cases:
+        # 1. If `force_nullable_schema` is true.
+        # 2. If `force_nullable_schema` is false and we
+        #    want to enforce non-nullability on a column
+        #    that has no nulls (the chunked parquet writer case).
+        col_meta.set_nullability(force_nullable_schema)
 
     if is_struct_dtype(col):
         for i, (child_col, name) in enumerate(

From 7a50f0de34dc70f5eae7ee0b7ada2eb94c37fdbe Mon Sep 17 00:00:00 2001
From: galipremsagar
Date: Fri, 24 Mar 2023 10:29:33 -0700
Subject: [PATCH 2/2] Restrict force_nullable_schema to the non-chunked writer

---
 python/cudf/cudf/_lib/parquet.pyx      | 30 ++++++--------------------
 python/cudf/cudf/tests/test_parquet.py | 20 -----------------
 2 files changed, 6 insertions(+), 44 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 0ebda815ee9..923f5c4089f 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -470,16 +470,6 @@ cdef class ParquetWriter:
     max_page_size_rows: int, default 20000
         Maximum number of rows of each page of the output.
         By default, 20000 will be used.
-    force_nullable_schema : bool, default True.
-        If True, writes all columns as `null` in schema.
-        If False, columns are written as `null` if they contain null values,
-        otherwise as `not null`.
-
-    Notes
-    -----
-    `DataFrame.to_parquet` and `ParquetWriter` differ in the default
-    value for `force_nullable_schema` to enable all the chunks being
-    written by chunked parquet writer to be schema identical.
 
     See Also
     --------
@@ -497,15 +487,13 @@ cdef class ParquetWriter:
     cdef size_type row_group_size_rows
     cdef size_t max_page_size_bytes
     cdef size_type max_page_size_rows
-    cdef bool force_nullable_schema
 
     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
                   int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
                   int row_group_size_rows=1000000,
                   int max_page_size_bytes=524288,
-                  int max_page_size_rows=20000,
-                  bool force_nullable_schema=True):
+                  int max_page_size_rows=20000):
         filepaths_or_buffers = (
             list(filepath_or_buffer)
             if is_list_like(filepath_or_buffer)
@@ -520,7 +508,6 @@ cdef class ParquetWriter:
         self.row_group_size_rows = row_group_size_rows
         self.max_page_size_bytes = max_page_size_bytes
         self.max_page_size_rows = max_page_size_rows
-        self.force_nullable_schema = force_nullable_schema
 
     def write_table(self, table, object partitions_info=None):
         """ Writes a single table to the file """
@@ -615,7 +602,6 @@ cdef class ParquetWriter:
                 _set_col_metadata(
                     table[name]._column,
                     self.tbl_meta.get().column_metadata[i],
-                    self.force_nullable_schema
                 )
 
         index = (
@@ -696,16 +682,12 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression):
 cdef _set_col_metadata(
     Column col,
     column_in_metadata& col_meta,
-    bool force_nullable_schema
+    bool force_nullable_schema=False,
 ):
-    if force_nullable_schema or not col.nullable:
-        # We only need to set/define `_nullable` in libcudf
-        # in two cases:
-        # 1. If `force_nullable_schema` is true.
-        # 2. If `force_nullable_schema` is false and we
-        #    want to enforce non-nullability on a column
-        #    that has no nulls (the chunked parquet writer case).
-        col_meta.set_nullability(force_nullable_schema)
+    if force_nullable_schema:
+        # Only set nullability if `force_nullable_schema`
+        # is true.
+        col_meta.set_nullability(True)
 
     if is_struct_dtype(col):
         for i, (child_col, name) in enumerate(
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 9b783b03dad..c24ff080033 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2788,23 +2788,3 @@ def test_parquet_writer_schema_nullability(data, force_nullable_schema):
     assert pa.parquet.read_schema(file_obj).field(0).nullable == (
         force_nullable_schema or df.isnull().any().any()
     )
-
-
-@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}])
-@pytest.mark.parametrize("force_nullable_schema", [True, False])
-def test_parquet_chunked_writer_schema_nullability(
-    data, force_nullable_schema
-):
-    df = cudf.DataFrame(data)
-    file_obj = BytesIO()
-
-    writer = ParquetWriter(
-        file_obj, force_nullable_schema=force_nullable_schema
-    )
-
-    writer.write_table(df)
-
-    writer.close()
-    assert pa.parquet.read_schema(file_obj).field(0).nullable == (
-        force_nullable_schema or df.isnull().any().any()
-    )
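
A minimal sketch of the behavior after this patch. It assumes the
cudf 23.04-era API in which `force_nullable_schema` is exposed on
`DataFrame.to_parquet` (the path the surviving
`test_parquet_writer_schema_nullability` test exercises) but, after this
patch, no longer on the chunked `ParquetWriter`; the buffer and variable
names are illustrative only.

    from io import BytesIO

    import pyarrow.parquet as pq

    import cudf
    from cudf.io.parquet import ParquetWriter

    df = cudf.DataFrame({"a": [1, 2, 3, 4]})  # column with no nulls

    # Single-shot writer: schema nullability can still be forced.
    buf = BytesIO()
    df.to_parquet(buf, force_nullable_schema=True)
    assert pq.read_schema(buf).field(0).nullable

    # Chunked writer: the keyword is removed by this patch, so
    # `_set_col_metadata` falls back to its `force_nullable_schema=False`
    # default, never calls `set_nullability`, and leaves nullability to
    # libcudf's defaults; every chunk then shares one schema by
    # construction.
    chunked_buf = BytesIO()
    writer = ParquetWriter(chunked_buf)
    writer.write_table(df)
    writer.close()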