From 541e7e864c700bedfc667b5199a3415fca1b311d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:58:20 -0800 Subject: [PATCH] Make `column_empty` mask buffer creation consistent with libcudf (#16715) Based on offline discussions, this PR makes `column_empty` consistent with libcudf where * A size 0 "empty" column should not have a mask buffer * A size > 0 "empty" (i.e all null) column should have a mask buffer Additionally removes `column_empty_like` which can be subsumed by `column_empty` (I didn't find any active usage of this method across RAPIDS https://github.com/search?q=org%3Arapidsai%20column_empty_like&type=code) `column_empty` will have an unused `masked` argument, but since there is usage of this method across RAPIDS I'll need to adjust them before removing that keyword here (https://github.com/search?q=org%3Arapidsai%20column_empty&type=code) Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16715 --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 66 +++++++++------------- python/cudf/cudf/core/dataframe.py | 14 ++--- python/cudf/cudf/core/reshape.py | 18 +++--- python/cudf/cudf/core/udf/groupby_utils.py | 5 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 25 ++++++++ python/cudf/cudf/tests/test_string_udfs.py | 4 +- python/cudf/cudf/utils/queryutils.py | 3 +- 9 files changed, 75 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 0a9d339a6a8..db8d33f013a 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -6,7 +6,6 @@ as_column, build_column, column_empty, - column_empty_like, concat_columns, deserialize_columns, serialize_columns, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cdc3a03f445..c8cd80f45f4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -882,7 +882,7 @@ def take( """ # Handle zero size if indices.size == 0: - return cast(Self, column_empty_like(self, newsize=0)) + return cast(Self, column_empty(row_count=0, dtype=self.dtype)) # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. @@ -1222,7 +1222,6 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: "data": (self.data_ptr, False), "version": 1, } - if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify @@ -1516,37 +1515,6 @@ def _return_sentinel_column(): return codes.fillna(na_sentinel.value) -def column_empty_like( - column: ColumnBase, - dtype: Dtype | None = None, - masked: bool = False, - newsize: int | None = None, -) -> ColumnBase: - """Allocate a new column like the given *column*""" - if dtype is None: - dtype = column.dtype - row_count = len(column) if newsize is None else newsize - - if ( - hasattr(column, "dtype") - and isinstance(column.dtype, cudf.CategoricalDtype) - and dtype == column.dtype - ): - catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like( - catcolumn.codes, masked=masked, newsize=newsize - ) - return build_column( - data=None, - dtype=dtype, - mask=codes.base_mask, - children=(codes,), - size=codes.size, - ) - - return column_empty(row_count, dtype, masked) - - def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" return any( @@ -1556,9 +1524,31 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( - row_count: int, dtype: Dtype = "object", masked: bool = False + row_count: int, + dtype: Dtype = "object", + masked: bool = False, + for_numba: bool = False, ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype.""" + """ + Allocate a new column with the given row_count and dtype. + + * Passing row_count == 0 creates a size 0 column without a mask buffer. + * Passing row_count > 0 creates an all null column with a mask buffer. + + Parameters + ---------- + row_count : int + Number of elements in the column. + + dtype : Dtype + Type of the column. + + masked : bool + Unused. + + for_numba : bool, default False + If True, don't allocate a mask as it's not supported by numba. + """ dtype = cudf.dtype(dtype) children: tuple[ColumnBase, ...] = () @@ -1600,7 +1590,7 @@ def column_empty( else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) - if masked: + if row_count > 0 and not for_numba: mask = as_buffer( plc.null_mask.create_null_mask( row_count, plc.null_mask.MaskState.ALL_NULL @@ -2353,9 +2343,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: if not is_dtype_equal(obj.dtype, head.dtype): # if all null, cast to appropriate dtype if obj.null_count == len(obj): - objs[i] = column_empty_like( - head, dtype=head.dtype, masked=True, newsize=len(obj) - ) + objs[i] = column_empty(row_count=len(obj), dtype=head.dtype) else: raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fa8d517a9ef..656274bca38 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1424,8 +1424,8 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty_like( - col, masked=True, newsize=length + else column.column_empty( + row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values ) @@ -3385,10 +3385,8 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty_like( - col_data, masked=True, newsize=length - ) - for col_data in self._columns + column.column_empty(row_count=length, dtype=dtype) + for _, dtype in self._dtypes ), verify=False, ) @@ -6191,8 +6189,8 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty_like( - qs, dtype=ser.dtype, masked=True, newsize=len(qs) + res = column.column_empty( + row_count=len(qs), dtype=ser.dtype ) result[k] = res result = DataFrame._from_data(result) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f37b44b1100..a6815da62c6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -14,7 +14,7 @@ from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core.column import ColumnBase, as_column, column_empty_like +from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor from cudf.utils.dtypes import min_unsigned_type @@ -421,8 +421,8 @@ def concat( # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. - result_data[name] = cudf.core.column.column_empty_like( - col, newsize=0 + result_data[name] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[name] = col @@ -458,8 +458,8 @@ def concat( else: col_label = (k, name) if empty_inner: - result_data[col_label] = ( - cudf.core.column.column_empty_like(col, newsize=0) + result_data[col_label] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[col_label] = col @@ -995,9 +995,7 @@ def as_tuple(x): ] new_size = nrows * len(names) scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target_col = cudf.core.column.column_empty_like( - col, masked=True, newsize=new_size - ) + target_col = column_empty(row_count=new_size, dtype=col.dtype) target_col[scatter_map] = col target = cudf.Index._from_column(target_col) result.update( @@ -1300,7 +1298,9 @@ def _one_hot_encode_column( """ if isinstance(column.dtype, cudf.CategoricalDtype): if column.size == column.null_count: - column = column_empty_like(categories, newsize=column.size) + column = column_empty( + row_count=column.size, dtype=categories.dtype + ) else: column = column._get_decategorized_column() # type: ignore[attr-defined] diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 3af662b62ea..814d3e9fc85 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -154,8 +154,9 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - output = cudf.core.column.column_empty(ngroups, dtype=return_type) - + output = cudf.core.column.column_empty( + ngroups, dtype=return_type, for_numba=True + ) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7d87fc73621..260b481b933 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -852,7 +852,7 @@ def test_listcol_setitem_retain_dtype(): {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} ) df1 = df.head(0) - # Performing a setitem on `b` triggers a `column.column_empty_like` call + # Performing a setitem on `b` triggers a `column.column_empty` call # which tries to create an empty ListColumn. df1["b"] = df1["c"] # Performing a copy to trigger a copy dtype which is obtained by accessing diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index de3636f7526..13efa71ebae 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4158,6 +4158,31 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) +def test_parquet_roundtrip_zero_rows_no_column_mask(): + expected = cudf.DataFrame._from_data( + { + "int": cudf.core.column.column_empty(0, "int64"), + "float": cudf.core.column.column_empty(0, "float64"), + "datetime": cudf.core.column.column_empty(0, "datetime64[ns]"), + "timedelta": cudf.core.column.column_empty(0, "timedelta64[ns]"), + "bool": cudf.core.column.column_empty(0, "bool"), + "decimal": cudf.core.column.column_empty( + 0, cudf.Decimal64Dtype(1) + ), + "struct": cudf.core.column.column_empty( + 0, cudf.StructDtype({"a": "int64"}) + ), + "list": cudf.core.column.column_empty( + 0, cudf.ListDtype("float64") + ), + } + ) + with BytesIO() as bio: + expected.to_parquet(bio) + result = cudf.read_parquet(bio) + assert_eq(result, expected) + + def test_parquet_reader_mismatched_nullability(): # Ensure that we can faithfully read the tables with mismatched nullabilities df1 = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 69876d97aad..f4841f42e91 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -82,7 +82,9 @@ def run_udf_test(data, func, dtype): ) else: dtype = np.dtype(dtype) - output = cudf.core.column.column_empty(len(data), dtype=dtype) + output = cudf.core.column.column_empty( + len(data), dtype=dtype, for_numba=True + ) cudf_column = cudf.core.column.as_column(data) str_views = column_to_string_view_array(cudf_column) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 8966789fee8..4e3d32c8ed0 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -210,7 +210,6 @@ def query_execute(df, expr, callenv): Contains keys 'local_dict', 'locals' and 'globals' which are all dict. They represent the arg, local and global dictionaries of the caller. """ - # compile compiled = query_compile(expr) columns = compiled["colnames"] @@ -247,7 +246,7 @@ def query_execute(df, expr, callenv): # allocate output buffer nrows = len(df) - out = column_empty(nrows, dtype=np.bool_) + out = column_empty(nrows, dtype=np.bool_, for_numba=True) # run kernel args = [out, *colarrays, *envargs] with _CUDFNumbaConfig():