From 03255c753b821b6a91594c956d91f84636df3aa9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:49:00 -0800 Subject: [PATCH 1/2] Move cudf._lib.unary to cudf.core._internals --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/column.pyi | 9 +++ python/cudf/cudf/_lib/unary.pyx | 60 ------------------- python/cudf/cudf/core/_internals/unary.py | 64 +++++++++++++++++++++ python/cudf/cudf/core/column/categorical.py | 9 +-- python/cudf/cudf/core/column/column.py | 9 +-- python/cudf/cudf/core/column/datetime.py | 3 +- python/cudf/cudf/core/column/decimal.py | 5 +- python/cudf/cudf/core/column/numerical.py | 11 ++-- python/cudf/cudf/core/column/timedelta.py | 3 +- python/cudf/cudf/core/tools/numeric.py | 3 +- 12 files changed, 98 insertions(+), 80 deletions(-) delete mode 100644 python/cudf/cudf/_lib/unary.pyx create mode 100644 python/cudf/cudf/core/_internals/unary.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5d4b5421f16..e0b5e818516 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -51,7 +51,6 @@ set(cython_sources transform.pyx transpose.pyx types.pyx - unary.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 918edb6d3f1..4e0636a715d 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -36,7 +36,6 @@ text, timezone, transpose, - unary, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bb38488eefb..bdd90be45b8 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,12 @@ from __future__ import annotations +from typing import Literal + from typing_extensions import Self +import pylibcudf as plc + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -71,3 +75,8 @@ class Column: # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... + @staticmethod + def from_pylibcudf( + col: plc.Column, data_ptr_exposed: bool = False + ) -> ColumnBase: ... + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx deleted file mode 100644 index d5602fd5a1c..00000000000 --- a/python/cudf/cudf/_lib/unary.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import numpy as np - -import pylibcudf - -from cudf.api.types import is_decimal_dtype -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def unary_operation(Column input, object op): - return Column.from_pylibcudf( - pylibcudf.unary.unary_operation(input.to_pylibcudf(mode="read"), op) - ) - - -@acquire_spill_lock() -def is_null(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_null(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_valid(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_valid(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def cast(Column input, object dtype=np.float64): - result = Column.from_pylibcudf( - pylibcudf.unary.cast( - input.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype) - ) - ) - - if is_decimal_dtype(result.dtype): - result.dtype.precision = dtype.precision - return result - - -@acquire_spill_lock() -def is_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_nan(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_non_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_not_nan(input.to_pylibcudf(mode="read")) - ) diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py new file mode 100644 index 00000000000..3b8e3db60a7 --- /dev/null +++ b/python/cudf/cudf/core/_internals/unary.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf.api.types import is_decimal_dtype +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def unary_operation( + col: ColumnBase, op: plc.unary.UnaryOperator +) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.unary_operation(col.to_pylibcudf(mode="read"), op) + ) + + +@acquire_spill_lock() +def is_null(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_null(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def is_valid(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_valid(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def cast(col: ColumnBase, dtype: Dtype) -> ColumnBase: + result = type(col).from_pylibcudf( + plc.unary.cast( + col.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) + ) + ) + + if is_decimal_dtype(result.dtype): + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result + + +@acquire_spill_lock() +def is_nan(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_nan(col.to_pylibcudf(mode="read")) + ) + + +@acquire_spill_lock() +def is_non_nan(col: ColumnBase) -> ColumnBase: + return type(col).from_pylibcudf( + plc.unary.is_not_nan(col.to_pylibcudf(mode="read")) + ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 087d0ed65f5..b7d5e8658a0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -14,6 +14,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask +from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -1018,12 +1019,12 @@ def isnull(self) -> ColumnBase: """ Identify missing values in a CategoricalColumn. """ - result = libcudf.unary.is_null(self) + result = unary.is_null(self) if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = unary.is_nan(self.categories) if categories.any(): code = self._encode(np.nan) result = result | (self.codes == cudf.Scalar(code)) @@ -1034,12 +1035,12 @@ def notnull(self) -> ColumnBase: """ Identify non-missing values in a CategoricalColumn. """ - result = libcudf.unary.is_valid(self) + result = unary.is_valid(self) if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = unary.is_nan(self.categories) if categories.any(): code = self._encode(np.nan) result = result & (self.codes != cudf.Scalar(code)) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d2f9d208c77..03dcf6bec1e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -47,6 +47,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 +from cudf.core._internals import unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -713,12 +714,12 @@ def isnull(self) -> ColumnBase: if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(False, length=len(self)) - result = libcudf.unary.is_null(self) + result = unary.is_null(self) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result | libcudf.unary.is_nan(self) + result = result | unary.is_nan(self) return result @@ -727,12 +728,12 @@ def notnull(self) -> ColumnBase: if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(True, length=len(self)) - result = libcudf.unary.is_valid(self) + result = unary.is_valid(self) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result & libcudf.unary.is_non_nan(self) + result = result & unary.is_non_nan(self) return result diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6dc250e64d..c2418cb014a 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,6 +19,7 @@ from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted from cudf.core._compat import PANDAS_GE_220 +from cudf.core._internals import unary from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, get_compatible_timezone, @@ -487,7 +488,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " "Use tz_localize instead." ) - return libcudf.unary.cast(self, dtype=dtype) + return unary.cast(self, dtype=dtype) # type: ignore[return-value] def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8ae06f72d1e..540aa02b842 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -17,6 +17,7 @@ from_decimal as cpp_from_decimal, ) from cudf.api.types import is_scalar +from cudf.core._internals import unary from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -85,7 +86,7 @@ def as_decimal_column( if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: @@ -232,7 +233,7 @@ def _decimal_quantile( def as_numerical_column( self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] class Decimal32Column(DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 620cae65374..313b5bad3bd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,6 +14,7 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar +from cudf.core._internals import unary from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand @@ -125,7 +126,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - nan_col = libcudf.unary.is_nan(self) + nan_col = unary.is_nan(self) return nan_col.indices_of(True) else: return super().indices_of(value) @@ -184,7 +185,7 @@ def unary_operator(self, unaryop: str | Callable) -> ColumnBase: unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) unaryop = pylibcudf.unary.UnaryOperator[unaryop] - return libcudf.unary.unary_operation(self, unaryop) + return unary.unary_operation(self, unaryop) def __invert__(self): if self.dtype.kind in "ui": @@ -388,13 +389,13 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype) + return unary.cast(self, dtype) # type: ignore[return-value] def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column @@ -421,7 +422,7 @@ def any(self, skipna: bool = True) -> bool: def nan_count(self) -> int: if self.dtype.kind != "f": return 0 - nan_col = libcudf.unary.is_nan(self) + nan_col = unary.is_nan(self) return nan_col.sum() def _process_values_for_isin( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 087d6474e7f..41679a8c272 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -13,6 +13,7 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_scalar +from cudf.core._internals import unary from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -304,7 +305,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self - return libcudf.unary.cast(self, dtype=dtype) + return unary.cast(self, dtype=dtype) # type: ignore[return-value] def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6cecf3fa170..9a22045ff78 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -11,6 +11,7 @@ from cudf import _lib as libcudf from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype +from cudf.core._internals import unary from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index @@ -171,7 +172,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): - col = libcudf.unary.cast(col, downcast_dtype) + col = unary.cast(col, downcast_dtype) break if isinstance(arg, (cudf.Series, pd.Series)): From 632cb7aad0293127d97d8045e5ac2c2f53be8996 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Nov 2024 17:06:55 -0800 Subject: [PATCH 2/2] adjust usage in test --- python/cudf/cudf/testing/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 668e7a77454..8d342f8e6c6 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -10,8 +10,8 @@ from pandas import testing as tm import cudf -from cudf._lib.unary import is_nan from cudf.api.types import is_numeric_dtype, is_string_dtype +from cudf.core._internals.unary import is_nan from cudf.core.missing import NA, NaT