Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate dtype= parameter in reduction methods #16313

Merged
merged 1 commit into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions python/cudf/cudf/_lib/reduce.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import warnings

import cudf
from cudf.core.buffer import acquire_spill_lock
Expand Down Expand Up @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
A numpy data type to use for the output, defaults
to the same type as the input column
"""

col_dtype = (
dtype if dtype is not None
else incol._reduction_result_dtype(reduction_op)
)
if dtype is not None:
warnings.warn(
"dtype is deprecated and will be remove in a future release. "
"Cast the result (e.g. .astype) after the operation instead.",
FutureWarning
)
col_dtype = dtype
else:
col_dtype = incol._reduction_result_dtype(reduction_op)

# check empty case
if len(incol) <= incol.null_count:
Expand Down
11 changes: 8 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool:
if self.null_count == self.size:
return True

return libcudf.reduce.reduce("all", self, dtype=np.bool_)
return libcudf.reduce.reduce("all", self)

def any(self, skipna: bool = True) -> bool:
# Early exit for fast cases.
Expand All @@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool:
elif skipna and self.null_count == self.size:
return False

return libcudf.reduce.reduce("any", self, dtype=np.bool_)
return libcudf.reduce.reduce("any", self)

def dropna(self) -> Self:
if self.has_nulls():
Expand Down Expand Up @@ -1305,7 +1305,10 @@ def _reduce(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
dtype = kwargs.pop("dtype", None)
return libcudf.reduce.reduce(
op, preprocessed, dtype=dtype, **kwargs
)
return preprocessed

def _process_for_reduction(
Expand Down Expand Up @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
Determine the correct dtype to pass to libcudf based on
the input dtype, data dtype, and specific reduction op
"""
if reduction_op in {"any", "all"}:
return np.dtype(np.bool_)
return self.dtype

def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
Expand Down
9 changes: 3 additions & 6 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,26 +485,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
format = format.split(" ")[0]
return self.strftime(format)

def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
) -> ScalarLike:
def mean(self, skipna=None, min_count: int = 0) -> ScalarLike:
return pd.Timestamp(
cast(
"cudf.core.column.NumericalColumn", self.astype("int64")
).mean(skipna=skipna, min_count=min_count, dtype=dtype),
).mean(skipna=skipna, min_count=min_count),
unit=self.time_unit,
).as_unit(self.time_unit)

def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
skipna=skipna, min_count=min_count, ddof=ddof
)
* _unit_to_nanoseconds_conversion[self.time_unit],
).as_unit(self.time_unit)
Expand Down
17 changes: 9 additions & 8 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool:
if result_col.null_count == result_col.size:
return True

return libcudf.reduce.reduce("all", result_col, dtype=np.bool_)
return libcudf.reduce.reduce("all", result_col)

def any(self, skipna: bool = True) -> bool:
# Early exit for fast cases.
Expand All @@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool:
elif skipna and result_col.null_count == result_col.size:
return False

return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
return libcudf.reduce.reduce("any", result_col)

@functools.cached_property
def nan_count(self) -> int:
Expand Down Expand Up @@ -684,15 +684,16 @@ def to_pandas(
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
col_dtype = self.dtype
if reduction_op in {"sum", "product"}:
col_dtype = (
col_dtype if col_dtype.kind == "f" else np.dtype("int64")
)
if self.dtype.kind == "f":
return self.dtype
return np.dtype("int64")
elif reduction_op == "sum_of_squares":
col_dtype = np.result_dtype(col_dtype, np.dtype("uint64"))
return np.result_dtype(self.dtype, np.dtype("uint64"))
elif reduction_op in {"var", "std", "mean"}:
return np.dtype("float64")

return col_dtype
return super()._reduction_result_dtype(reduction_op)


def _normalize_find_and_replace_input(
Expand Down
11 changes: 3 additions & 8 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,32 +144,27 @@ def mean(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
):
return self._reduce(
"mean", skipna=skipna, min_count=min_count, dtype=dtype
)
return self._reduce("mean", skipna=skipna, min_count=min_count)

def var(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
ddof=1,
):
return self._reduce(
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
"var", skipna=skipna, min_count=min_count, ddof=ddof
)

def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype=np.float64,
ddof=1,
):
return self._reduce(
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
"std", skipna=skipna, min_count=min_count, ddof=ddof
)

def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
return self
return libcudf.unary.cast(self, dtype=dtype)

def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
def mean(self, skipna=None) -> pd.Timedelta:
return pd.Timedelta(
cast(
"cudf.core.column.NumericalColumn", self.astype("int64")
).mean(skipna=skipna, dtype=dtype),
).mean(skipna=skipna),
unit=self.time_unit,
).as_unit(self.time_unit)

Expand Down Expand Up @@ -345,12 +345,11 @@ def std(
self,
skipna: bool | None = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
skipna=skipna, min_count=min_count, ddof=ddof
),
unit=self.time_unit,
).as_unit(self.time_unit)
Expand Down
15 changes: 9 additions & 6 deletions python/cudf/cudf/tests/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,16 +248,11 @@ def test_sum_masked(nelem):

def test_sum_boolean():
s = Series(np.arange(100000))
got = (s > 1).sum(dtype=np.int32)
got = (s > 1).sum()
expect = 99998

assert expect == got

got = (s > 1).sum(dtype=np.bool_)
expect = True

assert expect == got


def test_date_minmax():
np_data = np.random.normal(size=10**3)
Expand Down Expand Up @@ -371,3 +366,11 @@ def test_reduction_column_multiindex():
result = df.mean()
expected = df.to_pandas().mean()
assert_eq(result, expected)


@pytest.mark.parametrize("op", ["sum", "product"])
def test_dtype_deprecated(op):
ser = cudf.Series(range(5))
with pytest.warns(FutureWarning):
result = getattr(ser, op)(dtype=np.dtype(np.int8))
assert isinstance(result, np.int8)
Loading