From 6fb28b923384dc5e753222e6bf6546b47324ca7c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Apr 2024 17:08:18 -0700 Subject: [PATCH 1/3] Clean up __cuda_array_interface__ handling in as_column --- python/cudf/cudf/core/column/column.py | 100 +++++++------------------ 1 file changed, 27 insertions(+), 73 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 518513c66f0..21288702890 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1681,27 +1681,6 @@ def build_categorical_column( return cast("cudf.core.column.CategoricalColumn", result) -def _make_copy_replacing_NaT_with_null(column): - """Return a copy with NaT values replaced with nulls.""" - if np.issubdtype(column.dtype, np.timedelta64): - na_value = np.timedelta64("NaT", column.time_unit) - elif np.issubdtype(column.dtype, np.datetime64): - na_value = np.datetime64("NaT", column.time_unit) - else: - raise ValueError("This type does not support replacing NaT with null.") - - null = column_empty_like(column, masked=True, newsize=1) - out_col = cudf._lib.replace.replace( - column, - build_column( - as_buffer(np.array([na_value], dtype=column.dtype).view("|u1")), - dtype=column.dtype, - ), - null, - ) - return out_col - - def check_invalid_array(shape: tuple, dtype): """Invalid ndarrays properties that are not supported""" if len(shape) > 1: @@ -1784,50 +1763,32 @@ def as_column( return arbitrary elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ - shape = desc["shape"] - current_dtype = np.dtype(desc["typestr"]) - - check_invalid_array(shape, current_dtype) - - arb_dtype = cudf.dtype(current_dtype) + check_invalid_array(desc["shape"], np.dtype(desc["typestr"])) if desc.get("mask", None) is not None: # Extract and remove the mask from arbitrary before # passing to cupy.asarray - mask = _mask_from_cuda_array_interface_desc(arbitrary) - arbitrary = SimpleNamespace(__cuda_array_interface__=desc.copy()) - arbitrary.__cuda_array_interface__["mask"] = None - desc = arbitrary.__cuda_array_interface__ + cai_copy = desc.copy() + mask = _mask_from_cuda_array_interface_desc( + arbitrary, cai_copy.pop("mask") + ) + arbitrary = SimpleNamespace(__cuda_array_interface__=cai_copy) else: mask = None arbitrary = cupy.asarray(arbitrary) + arbitrary = cupy.ascontiguousarray(arbitrary) - if arb_dtype != current_dtype: - arbitrary = arbitrary.astype(arb_dtype) - current_dtype = arb_dtype - + data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) + col = build_column(data, dtype=arbitrary.dtype, mask=mask) if ( - desc["strides"] is not None - and not (arbitrary.itemsize,) == arbitrary.strides + nan_as_null + or (mask is None and nan_as_null is None) + and col.dtype.kind == "f" ): - arbitrary = cupy.ascontiguousarray(arbitrary) - - data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) - col = build_column(data, dtype=current_dtype, mask=mask) - + col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype) - - if isinstance(col, cudf.core.column.CategoricalColumn): - return col - elif np.issubdtype(col.dtype, np.floating): - if nan_as_null or (mask is None and nan_as_null is None): - mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan)) - col = col.set_mask(mask) - elif np.issubdtype(col.dtype, np.datetime64): - if nan_as_null or (mask is None and nan_as_null is None): - col = _make_copy_replacing_NaT_with_null(col) return col elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): @@ -2301,27 +2262,20 @@ def _construct_array( return arbitrary -def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: - desc = obj.__cuda_array_interface__ - mask = desc.get("mask", None) - - if mask is not None: - desc = mask.__cuda_array_interface__ - ptr = desc["data"][0] - nelem = desc["shape"][0] - typestr = desc["typestr"] - typecode = typestr[1] - if typecode == "t": - mask_size = bitmask_allocation_size_bytes(nelem) - mask = as_buffer(data=ptr, size=mask_size, owner=obj) - elif typecode == "b": - col = as_column(mask) - mask = bools_to_mask(col) - else: - raise NotImplementedError( - f"Cannot infer mask from typestr {typestr}" - ) - return mask +def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: + desc = cai_mask.__cuda_array_interface__ + ptr = desc["data"][0] + nelem = desc["shape"][0] + typestr = desc["typestr"] + typecode = typestr[1] + if typecode == "t": + mask_size = bitmask_allocation_size_bytes(nelem) + return as_buffer(data=ptr, size=mask_size, owner=obj) + elif typecode == "b": + col = as_column(cai_mask) + return bools_to_mask(col) + else: + raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") def serialize_columns(columns) -> Tuple[List[dict], List]: From 66527f14ffad006c773b03db321e56c5875b9ddc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Apr 2024 17:12:44 -0700 Subject: [PATCH 2/3] Reduce some access --- python/cudf/cudf/core/column/column.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 21288702890..5e1e4b2a5df 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2264,13 +2264,11 @@ def _construct_array( def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: desc = cai_mask.__cuda_array_interface__ - ptr = desc["data"][0] - nelem = desc["shape"][0] typestr = desc["typestr"] typecode = typestr[1] if typecode == "t": - mask_size = bitmask_allocation_size_bytes(nelem) - return as_buffer(data=ptr, size=mask_size, owner=obj) + mask_size = bitmask_allocation_size_bytes(desc["shape"][0]) + return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": col = as_column(cai_mask) return bools_to_mask(col) From 4ddad1b5a2b1c2d04c2601e36717a49e993eebac Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:35:06 -0700 Subject: [PATCH 3/3] ensure dtype float condition is checked --- python/cudf/cudf/core/column/column.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index eee9e7c5aca..9fc9634a738 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1782,10 +1782,8 @@ def as_column( data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) if ( - nan_as_null - or (mask is None and nan_as_null is None) - and col.dtype.kind == "f" - ): + nan_as_null or (mask is None and nan_as_null is None) + ) and col.dtype.kind == "f": col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype)