diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 519565fa48c..39da6b26502 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -70,6 +70,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: column_in_metadata& set_nullability(bool nullable) column_in_metadata& set_list_column_as_map() column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_decimal_precision(uint8_t precision) column_in_metadata& child(size_type i) cdef cppclass table_input_metadata: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 87179c02fe2..0158df46cc4 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,7 +20,8 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, is_categorical_dtype, is_list_dtype, - is_struct_dtype + is_struct_dtype, + is_decimal_dtype, ) from cudf._lib.utils cimport get_column_names @@ -310,7 +311,7 @@ cpdef write_parquet( for i, name in enumerate(table._column_names, num_index_cols_meta): tbl_meta.get().column_metadata[i].set_name(name.encode()) - _set_col_children_names( + _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] ) @@ -448,7 +449,7 @@ cdef class ParquetWriter: for i, name in enumerate(table._column_names, num_index_cols_meta): self.tbl_meta.get().column_metadata[i].set_name(name.encode()) - _set_col_children_names( + _set_col_metadata( table[name]._column, self.tbl_meta.get().column_metadata[i] ) @@ -546,14 +547,16 @@ cdef Column _update_column_struct_field_names( col.set_base_children(tuple(children)) return col -cdef _set_col_children_names(Column col, column_in_metadata& col_meta): +cdef _set_col_metadata(Column col, column_in_metadata& col_meta): if is_struct_dtype(col): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): col_meta.child(i).set_name(name.encode()) - _set_col_children_names(child_col, col_meta.child(i)) + _set_col_metadata(child_col, col_meta.child(i)) elif is_list_dtype(col): - _set_col_children_names(col.children[1], col_meta.child(1)) + _set_col_metadata(col.children[1], col_meta.child(1)) else: + if is_decimal_dtype(col): + col_meta.set_decimal_precision(col.dtype.precision) return diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 6698a47b416..4fe795e57a9 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -23,6 +23,7 @@ from cudf.utils.dtypes import ( is_categorical_dtype, is_list_dtype, is_struct_dtype, + is_decimal_dtype, ) @@ -80,7 +81,11 @@ cpdef generate_pandas_metadata(Table table, index): "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" ) - elif is_list_dtype(col) or is_struct_dtype(col): + elif ( + is_list_dtype(col) + or is_struct_dtype(col) + or is_decimal_dtype(col) + ): types.append(col.dtype.to_arrow()) else: types.append(np_to_pa_dtype(col.dtype)) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6d50e4b6fee..a549dbc8b07 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1920,3 +1920,18 @@ def test_parquet_writer_nested(tmpdir, data): got = pd.read_parquet(fname) assert_eq(expect, got) + + +def test_parquet_writer_decimal(tmpdir): + from cudf.core.dtypes import Decimal64Dtype + + gdf = cudf.DataFrame({"val": [0.00, 0.01, 0.02]}) + + gdf["dec_val"] = gdf["val"].astype(Decimal64Dtype(7, 2)) + + fname = tmpdir.join("test_parquet_writer_decimal.parquet") + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert_eq(gdf, got)