From e0af727a091c3ee98a873193ad3303abb4b3f240 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Feb 2022 11:14:52 -0800 Subject: [PATCH] Refactor array_ufunc for Index and unify across all classes (#10346) This PR builds on #10217 and #10287 to bring full ufunc support for Index types, expanding well beyond the small set previously supported in the `cudf.core.ops` namespace. By using most of the machinery introduced for IndexedFrame in the prior two PRs we avoid duplicating much logic so that all ufunc dispatches flow through a relatively standard path of known methods prior to a common cupy dispatch. With this change we are also able to deprecate the various ufunc operations defined in cudf/core/ops.py that exist only for this purpose as well as a number of Frame methods that are not defined for the corresponding pandas types. Users of those APIs are advised to call the corresponding numpy/cupy ufuncs instead to leverage the new dispatch. This PR also fixes a bug where index binary operations that output booleans would previously return instances of GenericIndex, whereas those pandas operations would return numpy arrays. cudf now returns cupy arrays in those cases. Resolves #9083. Contributes to #9038. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10346 --- python/cudf/cudf/core/_base_index.py | 8 - python/cudf/cudf/core/dataframe.py | 55 +++--- python/cudf/cudf/core/frame.py | 170 ++++++++++++++++- python/cudf/cudf/core/index.py | 55 +++++- python/cudf/cudf/core/indexed_frame.py | 186 ++++++------------- python/cudf/cudf/core/ops.py | 82 +++++++- python/cudf/cudf/core/series.py | 51 ++--- python/cudf/cudf/core/single_column_frame.py | 16 +- python/cudf/cudf/tests/test_array_ufunc.py | 89 ++++++++- python/cudf/cudf/tests/test_binops.py | 6 +- python/cudf/cudf/tests/test_dataframe.py | 8 - python/cudf/cudf/tests/test_pickling.py | 6 +- 12 files changed, 498 insertions(+), 234 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 16fb0cf99c1..b7b61e4d332 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -41,14 +41,6 @@ class BaseIndex(Serializable): _accessors: Set[Any] = set() _data: ColumnAccessor - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(*inputs) - else: - return NotImplemented - @cached_property def _values(self) -> ColumnBase: raise NotImplementedError diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6b5f3809c98..4062f811975 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -11,7 +11,17 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, MutableMapping, Optional, Set, TypeVar +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Set, + Tuple, + Type, + TypeVar, + Union, +) import cupy import numpy as np @@ -44,6 +54,7 @@ from cudf.core.abc 
import Serializable from cudf.core.column import ( CategoricalColumn, + ColumnBase, as_column, build_categorical_column, build_column, @@ -1909,7 +1920,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - def _prep_for_binop( + def _make_operands_and_index_for_binop( self, other: Any, fn: str, @@ -1918,7 +1929,13 @@ def _prep_for_binop( can_reindex: bool = False, *args, **kwargs, - ): + ) -> Tuple[ + Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ], + Optional[BaseIndex], + ]: lhs, rhs = self, other if _is_scalar_or_zero_d_array(rhs): @@ -1999,28 +2016,6 @@ def _prep_for_binop( return operands, lhs._index - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( - self, - other: Any, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - *args, - **kwargs, - ): - operands, out_index = self._prep_for_binop( - other, fn, fill_value, reflect, can_reindex - ) - if operands is NotImplemented: - return NotImplemented - - return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=out_index, - ) - @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, @@ -2183,9 +2178,7 @@ def columns(self, columns): columns = pd.Index(range(len(self._data.columns))) is_multiindex = isinstance(columns, pd.MultiIndex) - if isinstance( - columns, (Series, cudf.Index, cudf.core.column.ColumnBase) - ): + if isinstance(columns, (Series, cudf.Index, ColumnBase)): columns = pd.Index(columns.to_numpy(), tupleize_cols=is_multiindex) elif not isinstance(columns, pd.Index): columns = pd.Index(columns, tupleize_cols=is_multiindex) @@ -6626,7 +6619,7 @@ def _setitem_with_dataframe( input_df: DataFrame, replace_df: DataFrame, input_cols: Any = None, - mask: Optional[cudf.core.column.ColumnBase] = None, + mask: Optional[ColumnBase] = None, ignore_index: bool = False, ): """ @@ -6717,9 +6710,7 @@ def 
_get_union_of_series_names(series_list): def _get_host_unique(array): - if isinstance( - array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) - ): + if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): return array.unique.to_pandas() elif isinstance(array, (str, numbers.Number)): return [array] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 91d81a4c6de..0fd7848c7d1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -54,6 +54,49 @@ T = TypeVar("T", bound="Frame") +# Mapping from ufuncs to the corresponding binary operators. +_ufunc_binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", +} + +# These operators need to be mapped to their inverses when performing a +# reflected ufunc operation because no reflected version of the operators +# themselves exist. When these operators are invoked directly (not via +# __array_ufunc__) Python takes care of calling the inverse operation. +_ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", +} + + class Frame: """A collection of Column objects with an optional index. @@ -2752,6 +2795,11 @@ def sin(self): 0.8011526357338306, 0.8939966636005579], dtype='float64') """ + warnings.warn( + "sin is deprecated and will be removed. 
Use numpy.sin instead", + FutureWarning, + ) + return self._unaryop("sin") @annotate("FRAME_COS", color="green", domain="cudf_python") @@ -2814,6 +2862,11 @@ def cos(self): -0.5984600690578581, -0.4480736161291701], dtype='float64') """ + warnings.warn( + "cos is deprecated and will be removed. Use numpy.cos instead", + FutureWarning, + ) + return self._unaryop("cos") @annotate("FRAME_TAN", color="green", domain="cudf_python") @@ -2876,6 +2929,11 @@ def tan(self): -1.3386902103511544, -1.995200412208242], dtype='float64') """ + warnings.warn( + "tan is deprecated and will be removed. Use numpy.tan instead", + FutureWarning, + ) + return self._unaryop("tan") @annotate("FRAME_ASIN", color="green", domain="cudf_python") @@ -2927,6 +2985,11 @@ def asin(self): 1.5707963267948966, 0.3046926540153975], dtype='float64') """ + warnings.warn( + "asin is deprecated and will be removed in the future", + FutureWarning, + ) + return self._unaryop("asin") @annotate("FRAME_ACOS", color="green", domain="cudf_python") @@ -2978,6 +3041,11 @@ def acos(self): 1.5707963267948966, 1.266103672779499], dtype='float64') """ + warnings.warn( + "acos is deprecated and will be removed. Use numpy.acos instead", + FutureWarning, + ) + result = self.copy(deep=False) for col in result._data: min_float_dtype = cudf.utils.dtypes.get_min_float_dtype( @@ -3047,6 +3115,11 @@ def atan(self): 0.2914567944778671], dtype='float64') """ + warnings.warn( + "atan is deprecated and will be removed. Use numpy.atan instead", + FutureWarning, + ) + return self._unaryop("atan") @annotate("FRAME_EXP", color="green", domain="cudf_python") @@ -3110,6 +3183,11 @@ def exp(self): 2.718281828459045, 1.0, 1.3498588075760032], dtype='float64') """ + warnings.warn( + "exp is deprecated and will be removed. 
Use numpy.exp instead", + FutureWarning, + ) + return self._unaryop("exp") @annotate("FRAME_LOG", color="green", domain="cudf_python") @@ -3172,6 +3250,11 @@ def log(self): Float64Index([2.302585092994046, 2.3978952727983707, 6.214608098422191], dtype='float64') """ + warnings.warn( + "log is deprecated and will be removed. Use numpy.log instead", + FutureWarning, + ) + return self._unaryop("log") @annotate("FRAME_SQRT", color="green", domain="cudf_python") @@ -3228,6 +3311,11 @@ def sqrt(self): >>> index.sqrt() Float64Index([nan, 10.0, 25.0], dtype='float64') """ + warnings.warn( + "sqrt is deprecated and will be removed. Use numpy.sqrt instead", + FutureWarning, + ) + return self._unaryop("sqrt") @annotate("FRAME_ABS", color="green", domain="cudf_python") @@ -3496,7 +3584,9 @@ def _binaryop( Frame A new instance containing the result of the operation. """ - raise NotImplementedError + raise NotImplementedError( + f"Binary operations are not supported for {self.__class__}" + ) @classmethod @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") @@ -3658,6 +3748,84 @@ def _colwise_binop( return output + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + fname = ufunc.__name__ + if fname in _ufunc_binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + op = _ufunc_binary_operations[fname] + if reflect and op in _ops_without_reflection: + op = _ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # Float_power returns float irrespective of the input type. 
+ if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for various unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # None is a sentinel used by subclasses to trigger cupy dispatch. + return None + + def _apply_cupy_ufunc_to_operands( + self, ufunc, cupy_func, operands, **kwargs + ): + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations as well as + # trigonometric, but those operations are not defined on + # pd.Series/DataFrame. For now those operations will dispatch to cupy, + # but if ufuncs are ever a bottleneck we could add special handling to + # dispatch those (or any other) functions that we could implement + # without cupy. + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in operands.items(): + cupy_inputs = [] + for inp in (left, right) if ufunc.nin == 2 else (left,): + if isinstance(inp, ColumnBase) and inp.has_nulls(): + new_mask = as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. 
+ inp = inp.fillna(0) + cupy_inputs.append(cupy.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = as_column(out).set_mask(mask) + return data + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5b60e8dbd1c..5aab834d452 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -520,6 +520,11 @@ def _as_int64(self): # that are not defined directly on RangeIndex. return Int64Index._from_data(self._data) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + return self._as_int64().__array_ufunc__( + ufunc, method, *inputs, **kwargs + ) + def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate # on the corresponding integer index if possible. @@ -773,6 +778,41 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) + + if ret is not None: + return ret + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cupy, ufunc.__name__) + if cupy_func: + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs = self._make_operands_for_binop(other) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + + data = self._apply_cupy_ufunc_to_operands( + ufunc, cupy_func, inputs, **kwargs + ) + + out = [_index_from_data(out) for out in data] + + # pandas returns numpy arrays when the outputs are boolean. + for i, o in enumerate(out): + # We explicitly _do not_ use isinstance here: we want only + # boolean GenericIndexes, not dtype-specific subclasses. 
+ if type(o) is GenericIndex and o.dtype.kind == "b": + out[i] = o.values + + return out[0] if ufunc.nout == 1 else tuple(out) + + return NotImplemented + def _binaryop( self, other: T, @@ -784,11 +824,16 @@ def _binaryop( ) -> SingleColumnFrame: # Specialize binops to generate the appropriate output index type. operands = self._make_operands_for_binop(other, fill_value, reflect) - return ( - _index_from_data(data=self._colwise_binop(operands, fn),) - if operands is not NotImplemented - else NotImplemented - ) + if operands is NotImplemented: + return NotImplemented + ret = _index_from_data(self._colwise_binop(operands, fn)) + + # pandas returns numpy arrays when the outputs are boolean. We + # explicitly _do not_ use isinstance here: we want only boolean + # GenericIndexes, not dtype-specific subclasses. + if type(ret) is GenericIndex and ret.dtype.kind == "b": + return ret.values + return ret def _copy_type_metadata( self, other: Frame, include_index: bool = True diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8ff3e39519c..3ae0a838873 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -7,7 +7,7 @@ import warnings from collections import Counter, abc from functools import cached_property -from typing import Callable, Type, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union from uuid import uuid4 import cupy as cp @@ -1694,150 +1694,86 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. 
- if method != "__call__" or kwargs or ufunc.nin > 2: + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._make_operands_and_index_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } + return self._from_data( + ColumnAccessor(type(self)._colwise_binop(operands, fn)), + index=out_index, + ) + + def _make_operands_and_index_for_binop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ) -> Tuple[ + Union[ + Dict[ + Optional[str], + Tuple[cudf.core.column.ColumnBase, Any, bool, Any], + ], + Type[NotImplemented], + ], + Optional[cudf.BaseIndex], + ]: + raise NotImplementedError( + "Binary operations are not supported for {self.__class__}" + ) - # First look for methods of the class. + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. 
- ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" + if ret is not None: # pandas bitwise operations return bools if indexes are misaligned. - if ( - "bitwise" in fname - and isinstance(other, IndexedFrame) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. + if "bitwise" in fname: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + if isinstance(other, self.__class__) and not self.index.equals( + other.index + ): + ret = ret.astype(bool) + return ret # Attempt to dispatch all other functions to cupy. cupy_func = getattr(cp, fname) if cupy_func: - # Indices must be aligned before converting to arrays. 
if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index = self._prep_for_binop(other, fname) + inputs, index = self._make_operands_and_index_for_binop( + other, fname + ) else: + # This works for Index too inputs = { name: (col, None, False, None) for name, col in self._data.items() } index = self._index - mask = None - data = [{} for _ in range(ufunc.nout)] - for name, (left, right, _, _) in inputs.items(): - cupy_inputs = [] - # TODO: I'm jumping through multiple hoops to get the unary - # behavior to match up with the binary. I should see if there - # are better patterns to employ here. - for inp in (left, right) if ufunc.nin == 2 else (left,): - if ( - isinstance(inp, cudf.core.column.ColumnBase) - and inp.has_nulls() - ): - new_mask = cudf.core.column.as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and - # of bitmasks. Once we expose - # cudf::detail::bitwise_and, then we can use that - # instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume - # that the end result propagates nulls via a bitwise - # and, so these elements are irrelevant. - inp = inp.fillna(0) - cupy_inputs.append(cp.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - if ufunc.nout == 1: - cp_output = (cp_output,) - for i, out in enumerate(cp_output): - data[i][name] = cudf.core.column.as_column(out).set_mask( - mask - ) - - out = tuple( - self.__class__._from_data(out, index=index) for out in data + data = self._apply_cupy_ufunc_to_operands( + ufunc, cupy_func, inputs, **kwargs ) + + out = tuple(self._from_data(out, index=index) for out in data) return out[0] if ufunc.nout == 1 else out return NotImplemented diff --git a/python/cudf/cudf/core/ops.py b/python/cudf/cudf/core/ops.py index fe9e012f406..c2a8c0e72fb 100644 --- a/python/cudf/cudf/core/ops.py +++ b/python/cudf/cudf/core/ops.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. +import warnings from numbers import Number import numpy as np @@ -10,6 +11,10 @@ def sin(arbitrary): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.sin(arbitrary) else: @@ -17,6 +22,10 @@ def sin(arbitrary): def cos(arbitrary): + warnings.warn( + "cos is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.cos(arbitrary) else: @@ -24,6 +33,10 @@ def cos(arbitrary): def tan(arbitrary): + warnings.warn( + "tan is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.tan(arbitrary) else: @@ -31,6 +44,11 @@ def tan(arbitrary): def arcsin(arbitrary): + warnings.warn( + "arcsin is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arcsin(arbitrary) else: @@ -38,6 +56,11 @@ def arcsin(arbitrary): def arccos(arbitrary): + warnings.warn( + "arcsin is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arccos(arbitrary) else: @@ -45,6 +68,11 @@ def arccos(arbitrary): def arctan(arbitrary): + warnings.warn( + "arctan is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arctan(arbitrary) else: @@ -52,6 +80,10 @@ def arctan(arbitrary): def exp(arbitrary): + warnings.warn( + "exp is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.exp(arbitrary) else: @@ -59,6 +91,10 @@ def exp(arbitrary): def log(arbitrary): + warnings.warn( + "log is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.log(arbitrary) else: @@ -66,6 +102,10 @@ def log(arbitrary): def sqrt(arbitrary): + warnings.warn( + "sqrt is deprecated and will be 
removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.sqrt(arbitrary) else: @@ -73,6 +113,11 @@ def sqrt(arbitrary): def logical_not(arbitrary): + warnings.warn( + "logical_not is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.logical_not(arbitrary) else: @@ -80,6 +125,11 @@ def logical_not(arbitrary): def logical_and(lhs, rhs): + warnings.warn( + "logical_and is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.logical_and(lhs, rhs) else: @@ -87,6 +137,11 @@ def logical_and(lhs, rhs): def logical_or(lhs, rhs): + warnings.warn( + "logical_or is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.logical_or(lhs, rhs) else: @@ -94,6 +149,11 @@ def logical_or(lhs, rhs): def remainder(lhs, rhs): + warnings.warn( + "remainder is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.mod(lhs, rhs) elif isinstance(lhs, Frame): @@ -103,6 +163,10 @@ def remainder(lhs, rhs): def floor_divide(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.floor_divide(lhs, rhs) elif isinstance(lhs, Frame): @@ -112,6 +176,10 @@ def floor_divide(lhs, rhs): def subtract(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.subtract(lhs, rhs) elif isinstance(lhs, Frame): @@ -121,6 +189,10 @@ def subtract(lhs, rhs): def add(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.add(lhs, rhs) elif 
isinstance(rhs, Frame): @@ -130,6 +202,10 @@ def add(lhs, rhs): def true_divide(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.true_divide(lhs, rhs) elif isinstance(rhs, Frame): @@ -139,6 +215,10 @@ def true_divide(lhs, rhs): def multiply(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.multiply(lhs, rhs) elif isinstance(rhs, Frame): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8574a152c44..45a44016449 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, MutableMapping, Optional, Set, Union +from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union import cupy import numpy as np @@ -39,6 +39,7 @@ ) from cudf.core.abc import Serializable from cudf.core.column import ( + ColumnBase, DatetimeColumn, TimeDeltaColumn, arange, @@ -435,7 +436,7 @@ def __init__( else: data = {} - if not isinstance(data, column.ColumnBase): + if not isinstance(data, ColumnBase): data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype) else: if dtype is not None: @@ -444,7 +445,7 @@ def __init__( if index is not None and not isinstance(index, BaseIndex): index = as_index(index) - assert isinstance(data, column.ColumnBase) + assert isinstance(data, ColumnBase) super().__init__({name: data}) self._index = RangeIndex(len(data)) if index is None else index @@ -1206,7 +1207,7 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _prep_for_binop( + def _make_operands_and_index_for_binop( self, other: Any, fn: str, @@ -1215,22 +1216,19 @@ def _prep_for_binop( can_reindex: bool = False, 
*args, **kwargs, - ): + ) -> Tuple[ + Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ], + Optional[BaseIndex], + ]: # Specialize binops to align indices. - if isinstance(other, SingleColumnFrame): + if isinstance(other, Series): if ( - # TODO: The can_reindex logic also needs to be applied for - # DataFrame (the methods that need it just don't exist yet). not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS - and ( - isinstance(other, Series) - # TODO: mypy doesn't like this line because the index - # property is not defined on SingleColumnFrame (or Index, - # for that matter). Ignoring is the easy solution for now, - # a cleaner fix requires reworking the type hierarchy. - and not self.index.equals(other.index) # type: ignore - ) + and not self.index.equals(other.index) ): raise ValueError( "Can only compare identically-labeled Series objects" @@ -1242,27 +1240,6 @@ def _prep_for_binop( operands = lhs._make_operands_for_binop(other, fill_value, reflect) return operands, lhs._index - def _binaryop( - self, - other: Frame, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - *args, - **kwargs, - ): - operands, out_index = self._prep_for_binop( - other, fn, fill_value, reflect, can_reindex - ) - return ( - self._from_data( - data=self._colwise_binop(operands, fn), index=out_index, - ) - if operands is not NotImplemented - else NotImplemented - ) - def logical_and(self, other): warnings.warn( "Series.logical_and is deprecated and will be removed.", diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 50b206d3388..f02e3b9f959 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -4,7 +4,16 @@ from __future__ import annotations import builtins -from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + 
MutableMapping, + Optional, + Tuple, + Type, + TypeVar, + Union, +) import cupy import numpy as np @@ -279,7 +288,10 @@ def _make_operands_for_binop( reflect: bool = False, *args, **kwargs, - ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: + ) -> Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ]: """Generate the dictionary of operands used for a binary operation. Parameters diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index e4b4d5020ea..9d762f26ebd 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -50,6 +50,83 @@ def _hide_ufunc_warnings(ufunc): yield +@pytest.mark.parametrize("ufunc", _UFUNCS) +def test_ufunc_index(ufunc): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Index(cp.random.randint(low=1, high=10, size=N),) + for _ in range(ufunc.nin) + ] + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. 
+ if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + assert_eq(g, e, check_exact=False) + else: + assert_eq(got, expect, check_exact=False) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + if (got - expect).abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] +) +@pytest.mark.parametrize("type_", ["cupy", "numpy", "list"]) +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_index_array(ufunc, type_, reflect): + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. 
+ args = [cudf.Index(cp.random.rand(N)) for _ in range(ufunc.nin)] + + arg1 = args[1].to_cupy() if type_ == "cupy" else args[1].to_numpy() + if type_ == "list": + arg1 = arg1.tolist() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect): + if type_ == "cupy" and reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e, check_exact=False) + else: + if type_ == "cupy" and reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect, check_exact=False) + + @pytest.mark.parametrize("ufunc", _UFUNCS) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @@ -117,11 +194,11 @@ def test_ufunc_series(ufunc, has_nulls, indexed): for g, e in zip(got, expect): if has_nulls: e[mask] = np.nan - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan - assert_eq(got, expect) + assert_eq(got, expect, check_exact=False) except AssertionError: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved @@ -195,14 +272,14 @@ def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): if type_ == "cupy" and reflect: assert (cp.asnumpy(g) == e).all() else: - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan if type_ == "cupy" and reflect: assert (cp.asnumpy(got) == expect).all() else: - assert_eq(got, expect) + assert_eq(got, expect, check_exact=False) @pytest.mark.parametrize( @@ -298,11 +375,11 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): for g, e in zip(got, expect): if has_nulls: e[mask] = np.nan - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan - assert_eq(got, expect) + assert_eq(got, expect, 
check_exact=False) except AssertionError: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index c98568d53a5..db12743ac17 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1768,10 +1768,6 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): expected = data.to_pandas() == val got = data == val - # In case of index, expected would be a numpy array - if isinstance(data, cudf.BaseIndex): - expected = pd.Index(expected) - utils.assert_eq(expected, got) @@ -2969,7 +2965,7 @@ def test_binops_non_cudf_types(obj_class, binop, other_type): data = range(1, 100) lhs = obj_class(data) rhs = other_type(data) - assert cp.all((binop(lhs, rhs) == binop(lhs, lhs)).values) + assert (binop(lhs, rhs) == binop(lhs, lhs)).all() @pytest.mark.parametrize("binop", _binops + _binops_compare) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1db91633c5e..d7a5d07a5fc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3342,14 +3342,6 @@ def test_select_dtype_datetime_with_frequency(): ) -def test_array_ufunc(): - gdf = cudf.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) - pdf = gdf.to_pandas() - - assert_eq(np.sqrt(gdf), np.sqrt(pdf)) - assert_eq(np.sqrt(gdf.x), np.sqrt(pdf.x)) - - def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 8d504edd669..57b297004bf 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import sys @@ -90,9 +90,7 @@ def test_pickle_index(): idx = GenericIndex(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) - # TODO: Once operations like `all` are supported on Index objects, we can - # just use that without calling values first. - assert (idx == out).values.all() + assert (idx == out).all() def test_pickle_buffer():