From d7484d306aa7faf87952ba79a4cfd57ed4a873d7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Dec 2024 20:07:03 -0800 Subject: [PATCH] Remove cudf._lib.transform in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/transform.pyx | 113 -------------------- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 34 ++++-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 30 +++++- python/cudf/cudf/core/dataframe.py | 30 +++--- python/cudf/cudf/core/df_protocol.py | 3 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 3 +- python/cudf/cudf/datasets.py | 3 +- python/cudf/cudf/tests/test_column.py | 9 +- 14 files changed, 85 insertions(+), 165 deletions(-) delete mode 100644 python/cudf/cudf/_lib/transform.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e69a2672163..45c97b370a1 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -31,7 +31,6 @@ set(cython_sources string_casting.pyx strings_udf.pyx text.pyx - transform.pyx types.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx deleted file mode 100644 index a163bb07888..00000000000 --- a/python/cudf/cudf/_lib/transform.pyx +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from numba.np import numpy_support - -import cudf -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.utils import cudautils - -from pylibcudf cimport transform as plc_transform -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def bools_to_mask(Column col): - """ - Given an int8 (boolean) column, compress the data from booleans to bits and - return a Buffer - """ - mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) - return as_buffer(mask) - - -@acquire_spill_lock() -def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): - """ - Given a mask buffer, returns a boolean column representng bit 0 -> False - and 1 -> True within range of [begin_bit, end_bit), - """ - if not isinstance(mask_buffer, cudf.core.buffer.Buffer): - raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.Buffer") - plc_column = plc_transform.mask_to_bools( - mask_buffer.get_ptr(mode="read"), begin_bit, end_bit - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def nans_to_nulls(Column input): - mask, _ = plc_transform.nans_to_nulls( - input.to_pylibcudf(mode="read") - ) - return as_buffer(mask) - - -@acquire_spill_lock() -def transform(Column input, op): - nb_type = numpy_support.from_dtype(input.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(op, nb_signature) - np_dtype = cudf.dtype(compiled_op[1]) - - plc_column = plc_transform.transform( - input.to_pylibcudf(mode="read"), - compiled_op[0], - plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), - True - ) - return Column.from_pylibcudf(plc_column) - - -def table_encode(list source_columns): - plc_table, plc_column = plc_transform.encode( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - ) - - return ( - [Column.from_pylibcudf(col) for col in plc_table.columns()], - Column.from_pylibcudf(plc_column) - ) - - -def one_hot_encode(Column input_column, Column categories): - plc_table = plc_transform.one_hot_encode( - input_column.to_pylibcudf(mode="read"), - categories.to_pylibcudf(mode="read"), - ) - result_columns = [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in plc_table.columns() - ] - result_labels = [ - x if x is not None else '' - for x in categories.to_arrow().to_pylist() - ] - return dict(zip(result_labels, result_columns)) - - -@acquire_spill_lock() -def compute_column(list columns, tuple column_names, str expr): - """Compute a new column by evaluating an expression on a set of columns. - - Parameters - ---------- - columns : list - The set of columns forming the table to evaluate the expression on. - column_names : tuple[str] - The names associated with each column. These names are necessary to map - column names in the expression to indices in the provided list of - columns, which are what will be used by libcudf to evaluate the - expression on the table. - expr : str - The expression to evaluate. - """ - result = plc_transform.compute_column( - plc.Table([col.to_pylibcudf(mode="read") for col in columns]), - plc.expressions.to_expression(expr, column_names), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c849a9d3d2b..71ec11e75af 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -13,7 +13,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.transform import bools_to_mask from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -775,12 +774,11 @@ def to_pandas( raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": - new_mask = bools_to_mask(self.notnull()) col = type(self)( data=self.data, # type: ignore[arg-type] size=self.size, dtype=self.dtype, - mask=new_mask, + mask=self.notnull().fillna(False).as_mask(), children=self.children, ) else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c8cd80f45f4..a12df683190 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -32,7 +32,6 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -373,10 +372,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: - return libcudf.transform.mask_to_bools( - self.base_mask, self.offset, self.offset + len(self) + plc_column = plc.transform.mask_to_bools( + self.base_mask.get_ptr(mode="read"), # type: ignore[union-attr] + self.offset, + self.offset + len(self), ) + return type(self).from_pylibcudf(plc_column) @cached_property def memory_usage(self) -> int: @@ -981,11 +984,14 @@ def as_mask(self) -> Buffer: ------- Buffer """ - if self.has_nulls(): raise ValueError("Column must have no nulls.") - return bools_to_mask(self) + with acquire_spill_lock(): + mask, _ = plc.transform.bools_to_mask( + self.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @property def is_unique(self) -> bool: @@ -1514,6 +1520,18 @@ def _return_sentinel_column(): ) return codes.fillna(na_sentinel.value) + def one_hot_encode( + self, categories: ColumnBase + ) -> abc.Generator[ColumnBase]: + plc_table = plc.transform.one_hot_encode( + self.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), + ) + return ( + type(self).from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" @@ -2093,8 +2111,7 @@ def as_column( ) # Consider NaT as NA in the mask # but maintain NaT as a value - bool_mask = as_column(~is_nat) - mask = as_buffer(bools_to_mask(bool_mask)) + mask = as_column(~is_nat).as_mask() buffer = as_buffer(arbitrary.view("|u1")) col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) if dtype: @@ -2264,8 +2281,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": - col = as_column(cai_mask) - return bools_to_mask(col) + return as_column(cai_mask).as_mask() else: raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 789c4a7f3cb..0a02a48a38f 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -256,7 +256,7 @@ def from_sequences( data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), - mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), + mask=as_column(mask_col).as_mask(), offset=0, null_count=0, children=(offset_col, data_col), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8ca42debb72..95ddeeaae82 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,9 +7,10 @@ import numpy as np import pandas as pd +from numba.np import numpy_support from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf.core.column.column as column @@ -17,11 +18,13 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import unary +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError +from cudf.utils import cudautils from cudf.utils.dtypes import ( find_common_type, min_column_type, @@ -179,13 +182,27 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) + @acquire_spill_lock() + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + plc_column = plc.transform.transform( + self.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True, + ) + return type(self).from_pylibcudf(plc_column) + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): - return libcudf.transform.transform(self, unaryop) + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] + unaryop = plc.unary.UnaryOperator[unaryop] return unary.unary_operation(self, unaryop) def __invert__(self): @@ -298,8 +315,11 @@ def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. if self.dtype.kind != "f" or self.nan_count == 0: return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) + with acquire_spill_lock(): + mask, _ = plc.transform.nans_to_nulls( + self.to_pylibcudf(mode="read") + ) + return self.set_mask(as_buffer(mask)) def normalize_binop_value( self, other: ScalarLike diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 656274bca38..325601e5311 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6772,9 +6772,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) result = column.as_column(result, dtype=result_dtype) if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) + result = result.set_mask(mask._column.as_mask()) return Series._from_column(result, index=self.index) else: result_df = DataFrame(result, index=self.index) @@ -7883,6 +7881,16 @@ def interleave_columns(self): ) return self._constructor_sliced._from_column(result_col) + @acquire_spill_lock() + def _compute_columns(self, expr: str) -> ColumnBase: + plc_column = plc.transform.compute_column( + plc.Table( + [col.to_pylibcudf(mode="read") for col in self._columns] + ), + plc.expressions.to_expression(expr, self._column_names), + ) + return libcudf.column.Column.from_pylibcudf(plc_column) + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -8010,11 +8018,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_column( - libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - ) + return Series._from_column(self._compute_columns(statements[0])) targets = [] exprs = [] @@ -8030,15 +8034,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): targets.append(t.strip()) exprs.append(e.strip()) - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col + for name, expr in zip(targets, exprs): + ret._data[name] = self._compute_columns(expr) if not inplace: return ret diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index aa601a2b322..a798041699e 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -799,8 +799,7 @@ def _set_missing_values( valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) - boolmask = as_column(valid_mask._buf, dtype="bool") - bitmask = cudf._lib.transform.bools_to_mask(boolmask) + bitmask = as_column(valid_mask._buf, dtype="bool").as_mask() return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: valid_mask = _ensure_gpu_buffer( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..84a3caf905f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1457,7 +1457,14 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) + plc_table, plc_column = plc.transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) + ) + columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + indices = libcudf.column.Column.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 21ac009e7ff..95f3d4d01d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3507,7 +3507,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + col.set_base_mask(ans_mask.as_mask()) result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a6815da62c6..ccd7dfa26fd 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -9,7 +9,6 @@ import pandas as pd import cudf -from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar @@ -1310,7 +1309,11 @@ def _one_hot_encode_column( f"np.iinfo({size_type_dtype}).max. Consider reducing " "size of category" ) - data = one_hot_encode(column, categories) + result_labels = ( + x if x is not None else "" + for x in categories.to_arrow().to_pylist() + ) + data = dict(zip(result_labels, column.one_hot_encode(categories))) if drop_first and len(data): data.pop(next(iter(data))) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58cefc6554e..be74b0f867a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -17,7 +17,6 @@ from typing_extensions import Self, assert_never import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -526,7 +525,7 @@ def from_categorical(cls, categorical, codes=None): mask = None if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) + mask = valid_codes.as_mask() col = CategoricalColumn( data=col.data, size=codes.size, diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e8d634598f4..a91a4951306 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -4,7 +4,6 @@ import pandas as pd import cudf -from cudf._lib.transform import bools_to_mask __all__ = ["randomdata", "timeseries"] @@ -70,7 +69,7 @@ def timeseries( size=len(index), p=[1 - nulls_frequency, nulls_frequency], ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + mask_buf = cudf.core.column.as_column(mask).as_mask() masked_col = gdf[col]._column.set_mask(mask_buf) gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 65947efc2df..c3c9a1c5338 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data["a"]._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask)