Skip to content

Commit

Permalink
Match pandas.Series.quantile behavior
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar committed Aug 31, 2023
1 parent f999e1c commit a48fb54
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 5 deletions.
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ def quantile(
result = self._numeric_quantile(q, interpolation, exact)
if return_scalar:
scalar_result = result.element_indexing(0)
if interpolation in {"lower", "higher", "nearest"}:
try:
new_scalar = self.dtype.type(scalar_result)
scalar_result = (
new_scalar
if new_scalar == scalar_result
else scalar_result
)
except (TypeError, ValueError):
pass
return (
cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
if scalar_result is NA
Expand Down
13 changes: 10 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5487,16 +5487,23 @@ def quantile(
numeric_only : bool, default True
If False, the quantile of datetime and timedelta data will be
computed as well.
interpolation : {`linear`, `lower`, `higher`, `midpoint`, `nearest`}
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This parameter specifies the interpolation method to use,
when the desired quantile lies between two data points i and j.
Default is ``linear`` for ``method="single"``, and ``nearest``
Default is ``'linear'`` for ``method="single"``, and ``'nearest'``
for ``method="table"``.
* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j`, whichever is nearest.
* midpoint: (`i` + `j`) / 2.
columns : list of str
List of column names to include.
exact : boolean
Whether to use approximate or exact quantile algorithm.
method : {`single`, `table`}, default `single`
method : {'single', 'table'}, default 'single'
Whether to compute quantiles per-column ('single') or over all
columns ('table'). When 'table', the only allowed interpolation
methods are 'nearest', 'lower', and 'higher'.
Expand Down
9 changes: 7 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3132,8 +3132,13 @@ def quantile(
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to use,
when the desired quantile lies between two data points i and j:
columns : list of str
List of column names to include.
* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j`, whichever is nearest.
* midpoint: (`i` + `j`) / 2.
exact : boolean
Whether to use approximate or exact quantile algorithm.
quant_index : boolean
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/test_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,18 @@ def test_quantile_q_type():
),
):
gs.quantile(cudf.DataFrame())


@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
)
def test_quantile_type_int_float(interpolation):
    """Check that a scalar quantile of integer data matches pandas.

    Builds the same three-integer Series in pandas and cudf, takes the
    0.5 quantile with the given ``interpolation`` method, and requires
    that the cudf result equals the pandas result and has exactly the
    same type.
    """
    data = [1, 3, 4]
    psr = pd.Series(data)
    gsr = cudf.Series(data)

    expected = psr.quantile(0.5, interpolation=interpolation)
    actual = gsr.quantile(0.5, interpolation=interpolation)

    assert expected == actual
    # Exact type identity is what is being tested (isinstance would also
    # accept subclasses), so compare the type objects with `is`, not `==`.
    assert type(expected) is type(actual)

0 comments on commit a48fb54

Please sign in to comment.