diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 56bfa0ba332..64634b7a6f9 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import warnings import cudf from cudf.core.buffer import acquire_spill_lock @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - - col_dtype = ( - dtype if dtype is not None - else incol._reduction_result_dtype(reduction_op) - ) + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be removed in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning + ) + col_dtype = dtype + else: + col_dtype = incol._reduction_result_dtype(reduction_op) # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9467bbeed15..5e77aa87e4e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
@@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: if self.has_nulls(): @@ -1305,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 004a059af95..a4538179415 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -485,13 +485,11 @@ def as_string_column(self) -> cudf.core.column.StringColumn: format = format.split(" ")[0] return self.strftime(format) - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -499,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, 
dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cea68c88c90..ba080863722 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -684,15 +684,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares": - col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) + return np.result_type(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64") - return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -144,32 
+144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 36d7d9f9614..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -345,12 +345,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 
1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 @@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8)