diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 01a450ce1d0..2919b62b49c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -675,7 +675,11 @@ def append(self, other: ColumnBase) -> ColumnBase: return concat_columns([self, as_column(other)]) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, + q: np.ndarray, + interpolation: str, + exact: bool, + return_scalar: bool, ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 63c934b2b6c..b312f99829f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -6,7 +6,6 @@ import locale import re from locale import nl_langinfo -from numbers import Number from types import SimpleNamespace from typing import Any, Mapping, Sequence, Union, cast @@ -373,12 +372,19 @@ def median(self, skipna: bool = None) -> pd.Timestamp: ) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, + q: np.ndarray, + interpolation: str, + exact: bool, + return_scalar: bool, ) -> ColumnBase: result = self.as_numerical.quantile( - q=q, interpolation=interpolation, exact=exact + q=q, + interpolation=interpolation, + exact=exact, + return_scalar=return_scalar, ) - if isinstance(q, Number): + if return_scalar: return pd.Timestamp(result, unit=self.time_unit) return result.astype(self.dtype) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 87e1a87e68b..b547cb43cf5 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,8 +3,7 @@ from __future__ import annotations -from numbers import Number -from typing import Sequence, Union +from typing import cast import numpy as np @@ 
-92,18 +91,28 @@ def skew(self, skipna: bool = None) -> ScalarLike: return skew def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, + q: np.ndarray, + interpolation: str, + exact: bool, + return_scalar: bool, ) -> NumericalBaseColumn: - if isinstance(q, Number) or cudf.api.types.is_list_like(q): - np_array_q = np.asarray(q) - if np.logical_or(np_array_q < 0, np_array_q > 1).any(): - raise ValueError( - "percentiles should all be in the interval [0, 1]" - ) + if np.logical_or(q < 0, q > 1).any(): + raise ValueError( + "percentiles should all be in the interval [0, 1]" + ) # Beyond this point, q either being scalar or list-like # will only have values in range [0, 1] - result = self._numeric_quantile(q, interpolation, exact) - if isinstance(q, Number): + if len(self) == 0: + result = cast( + NumericalBaseColumn, + cudf.core.column.column_empty( + row_count=len(q), dtype=self.dtype, masked=True + ), + ) + else: + result = self._numeric_quantile(q, interpolation, exact) + if return_scalar: return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) if result[0] is cudf.NA @@ -137,12 +146,16 @@ def median(self, skipna: bool = None) -> NumericalBaseColumn: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) # enforce linear in case the default ever changes - return self.quantile(0.5, interpolation="linear", exact=True) + return self.quantile( + np.array([0.5]), + interpolation="linear", + exact=True, + return_scalar=True, + ) def _numeric_quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: np.ndarray, interpolation: str, exact: bool ) -> NumericalBaseColumn: - quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls sorted_indices = self.as_frame()._get_sorted_inds( ascending=True, na_position="first" @@ -150,7 +163,7 @@ def _numeric_quantile( sorted_indices = sorted_indices[self.null_count :] return libcudf.quantiles.quantile( - 
self, quant, interpolation, sorted_indices, exact + self, q, interpolation, sorted_indices, exact ) def cov(self, other: NumericalBaseColumn) -> float: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index d6aedf7f4f4..66e6271a4d1 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -3,8 +3,7 @@ from __future__ import annotations import datetime as dt -from numbers import Number -from typing import Any, Sequence, Tuple, Union, cast +from typing import Any, Sequence, Tuple, cast import numpy as np import pandas as pd @@ -330,12 +329,19 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, + q: np.ndarray, + interpolation: str, + exact: bool, + return_scalar: bool, ) -> "column.ColumnBase": result = self.as_numerical.quantile( - q=q, interpolation=interpolation, exact=exact + q=q, + interpolation=interpolation, + exact=exact, + return_scalar=return_scalar, ) - if isinstance(q, Number): + if return_scalar: return pd.Timedelta(result, unit=self.time_unit) return result.astype(self.dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ef5850ecc17..094e949cdd1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,6 @@ import pickle import warnings from collections import abc as abc -from numbers import Number from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union @@ -2746,21 +2745,31 @@ def quantile( dtype: float64 """ - result = self._column.quantile(q, interpolation, exact) + return_scalar = is_scalar(q) + if return_scalar: + np_array_q = np.asarray([float(q)]) + else: + try: + np_array_q = np.asarray(q) + except TypeError: + try: + np_array_q = 
cudf.core.column.as_column(q).values_host + except TypeError: + raise TypeError( + f"q must be a scalar or array-like, got {type(q)}" + ) - if isinstance(q, Number): - return result + result = self._column.quantile( + np_array_q, interpolation, exact, return_scalar=return_scalar + ) - if quant_index: - index = np.asarray(q) - if len(self) == 0: - result = column_empty_like( - index, dtype=self.dtype, masked=True, newsize=len(index), - ) - else: - index = None + if return_scalar: + return result - return Series(result, index=index, name=self.name) + return Series._from_data( + data={self.name: result}, + index=as_index(np_array_q) if quant_index else None, + ) @docutils.doc_describe() @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 4055485c49a..c4e3f690d13 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -1,7 +1,12 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ +import re + import pandas as pd +import pytest import cudf -from cudf.testing._utils import assert_eq +from cudf.testing._utils import assert_eq, assert_exceptions_equal def test_single_q(): @@ -46,3 +51,30 @@ def test_with_multiindex(): gdf_q = gdf.quantiles(q, interpolation="nearest") assert_eq(pdf_q, gdf_q, check_index_type=False) + + +@pytest.mark.parametrize("q", [2, [1, 2, 3]]) +def test_quantile_range_error(q): + ps = pd.Series([1, 2, 3]) + gs = cudf.from_pandas(ps) + assert_exceptions_equal( + lfunc=ps.quantile, + rfunc=gs.quantile, + lfunc_args_and_kwargs=([q],), + rfunc_args_and_kwargs=([q],), + expected_error_message=re.escape( + "percentiles should all be in the interval [0, 1]" + ), + ) + + +def test_quantile_q_type(): + gs = cudf.Series([1, 2, 3]) + with pytest.raises( + TypeError, + match=re.escape( + "q must be a scalar or array-like, got <class " + "'cudf.core.dataframe.DataFrame'>" + ), + ): + gs.quantile(cudf.DataFrame()) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 7bf339d6ab7..11ab69bda41 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -2,6 +2,7 @@ from concurrent.futures import ThreadPoolExecutor +import cupy as cp import numpy as np import pandas as pd import pytest @@ -201,13 +202,25 @@ def test_approx_quantiles_int(): @pytest.mark.parametrize("data", [[], [1, 2, 3, 10, 326497]]) -@pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]]) +@pytest.mark.parametrize( + "q", + [ + [], + 0.5, + 1, + 0.234, + [0.345], + [0.243, 0.5, 1], + np.array([0.5, 1]), + cp.array([0.5, 1]), + ], +) def test_misc_quantiles(data, q): pdf_series = cudf.utils.utils._create_pandas_series(data=data) gdf_series = cudf.Series(data) - expected = pdf_series.quantile(q) + expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) assert_eq(expected, actual)