Skip to content

Commit

Permalink
Deprecate dtype= parameter in reduction methods (#16313)
Browse files Browse the repository at this point in the history
In terms of pandas alignment, this argument doesn't exist in reduction ops. Additionally, the same result can be easily achieved by calling `astype` after the operation, and it appears libcudf does not support any arbitrary casting to an output type.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16313
  • Loading branch information
mroeschke authored Jul 19, 2024
1 parent dc62177 commit cb570fe
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 40 deletions.
15 changes: 10 additions & 5 deletions python/cudf/cudf/_lib/reduce.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import warnings

import cudf
from cudf.core.buffer import acquire_spill_lock
Expand Down Expand Up @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
A numpy data type to use for the output, defaults
to the same type as the input column
"""

col_dtype = (
dtype if dtype is not None
else incol._reduction_result_dtype(reduction_op)
)
if dtype is not None:
warnings.warn(
"dtype is deprecated and will be remove in a future release. "
"Cast the result (e.g. .astype) after the operation instead.",
FutureWarning
)
col_dtype = dtype
else:
col_dtype = incol._reduction_result_dtype(reduction_op)

# check empty case
if len(incol) <= incol.null_count:
Expand Down
11 changes: 8 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool:
if self.null_count == self.size:
return True

return libcudf.reduce.reduce("all", self, dtype=np.bool_)
return libcudf.reduce.reduce("all", self)

def any(self, skipna: bool = True) -> bool:
# Early exit for fast cases.
Expand All @@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool:
elif skipna and self.null_count == self.size:
return False

return libcudf.reduce.reduce("any", self, dtype=np.bool_)
return libcudf.reduce.reduce("any", self)

def dropna(self) -> Self:
if self.has_nulls():
Expand Down Expand Up @@ -1305,7 +1305,10 @@ def _reduce(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
dtype = kwargs.pop("dtype", None)
return libcudf.reduce.reduce(
op, preprocessed, dtype=dtype, **kwargs
)
return preprocessed

def _process_for_reduction(
Expand Down Expand Up @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
Determine the correct dtype to pass to libcudf based on
the input dtype, data dtype, and specific reduction op
"""
if reduction_op in {"any", "all"}:
return np.dtype(np.bool_)
return self.dtype

def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
Expand Down
9 changes: 3 additions & 6 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,26 +485,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
format = format.split(" ")[0]
return self.strftime(format)

def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
) -> ScalarLike:
def mean(self, skipna=None, min_count: int = 0) -> ScalarLike:
return pd.Timestamp(
cast(
"cudf.core.column.NumericalColumn", self.astype("int64")
).mean(skipna=skipna, min_count=min_count, dtype=dtype),
).mean(skipna=skipna, min_count=min_count),
unit=self.time_unit,
).as_unit(self.time_unit)

def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
skipna=skipna, min_count=min_count, ddof=ddof
)
* _unit_to_nanoseconds_conversion[self.time_unit],
).as_unit(self.time_unit)
Expand Down
17 changes: 9 additions & 8 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool:
if result_col.null_count == result_col.size:
return True

return libcudf.reduce.reduce("all", result_col, dtype=np.bool_)
return libcudf.reduce.reduce("all", result_col)

def any(self, skipna: bool = True) -> bool:
# Early exit for fast cases.
Expand All @@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool:
elif skipna and result_col.null_count == result_col.size:
return False

return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
return libcudf.reduce.reduce("any", result_col)

@functools.cached_property
def nan_count(self) -> int:
Expand Down Expand Up @@ -684,15 +684,16 @@ def to_pandas(
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
col_dtype = self.dtype
if reduction_op in {"sum", "product"}:
col_dtype = (
col_dtype if col_dtype.kind == "f" else np.dtype("int64")
)
if self.dtype.kind == "f":
return self.dtype
return np.dtype("int64")
elif reduction_op == "sum_of_squares":
col_dtype = np.result_dtype(col_dtype, np.dtype("uint64"))
return np.result_dtype(self.dtype, np.dtype("uint64"))
elif reduction_op in {"var", "std", "mean"}:
return np.dtype("float64")

return col_dtype
return super()._reduction_result_dtype(reduction_op)


def _normalize_find_and_replace_input(
Expand Down
11 changes: 3 additions & 8 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,32 +144,27 @@ def mean(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
):
return self._reduce(
"mean", skipna=skipna, min_count=min_count, dtype=dtype
)
return self._reduce("mean", skipna=skipna, min_count=min_count)

def var(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
ddof=1,
):
return self._reduce(
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
"var", skipna=skipna, min_count=min_count, ddof=ddof
)

def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
ddof=1,
):
return self._reduce(
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
"std", skipna=skipna, min_count=min_count, ddof=ddof
)

def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
return self
return libcudf.unary.cast(self, dtype=dtype)

def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
def mean(self, skipna=None) -> pd.Timedelta:
return pd.Timedelta(
cast(
"cudf.core.column.NumericalColumn", self.astype("int64")
).mean(skipna=skipna, dtype=dtype),
).mean(skipna=skipna),
unit=self.time_unit,
).as_unit(self.time_unit)

Expand Down Expand Up @@ -345,12 +345,11 @@ def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
skipna=skipna, min_count=min_count, ddof=ddof
),
unit=self.time_unit,
).as_unit(self.time_unit)
Expand Down
15 changes: 9 additions & 6 deletions python/cudf/cudf/tests/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,16 +248,11 @@ def test_sum_masked(nelem):

def test_sum_boolean():
s = Series(np.arange(100000))
got = (s > 1).sum(dtype=np.int32)
got = (s > 1).sum()
expect = 99998

assert expect == got

got = (s > 1).sum(dtype=np.bool_)
expect = True

assert expect == got


def test_date_minmax():
np_data = np.random.normal(size=10**3)
Expand Down Expand Up @@ -371,3 +366,11 @@ def test_reduction_column_multiindex():
result = df.mean()
expected = df.to_pandas().mean()
assert_eq(result, expected)


@pytest.mark.parametrize("op", ["sum", "product"])
def test_dtype_deprecated(op):
ser = cudf.Series(range(5))
with pytest.warns(FutureWarning):
result = getattr(ser, op)(dtype=np.dtype(np.int8))
assert isinstance(result, np.int8)

0 comments on commit cb570fe

Please sign in to comment.