Skip to content

Commit

Permalink
Ignore NaN correctly in .quantile (#17593)
Browse files Browse the repository at this point in the history
From an offline conversation: fixes the following discrepancy between cudf and pandas

```python
In [1]: import cudf

In [2]: import numpy as np

In [3]: ser = cudf.Series([np.nan, np.nan, 0.9], nan_as_null=False)

In [4]: ser
Out[4]: 
0    NaN
1    NaN
2    0.9
dtype: float64

In [5]: ser.quantile(0.9)
Out[5]: np.float64(nan)

In [6]: import pandas as pd

In [7]: ser = pd.Series([np.nan, np.nan, 0.9])

In [8]: ser.quantile(0.9)
Out[8]: np.float64(0.9)
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17593
  • Loading branch information
mroeschke authored Dec 13, 2024
1 parent 34e2045 commit 76b35ad
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,14 @@ def quantile(
),
)
else:
no_nans = self.nans_to_nulls()
# get sorted indices and exclude nulls
indices = sorting.order_by(
[self], [True], "first", stable=True
).slice(self.null_count, len(self))
[no_nans], [True], "first", stable=True
).slice(no_nans.null_count, len(no_nans))
with acquire_spill_lock():
plc_column = plc.quantiles.quantile(
self.to_pylibcudf(mode="read"),
no_nans.to_pylibcudf(mode="read"),
q,
plc.types.Interpolation[interpolation.upper()],
indices.to_pylibcudf(mode="read"),
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):

assert expected == actual
assert type(expected) is type(actual)


@pytest.mark.parametrize(
    "data",
    [
        [float("nan"), float("nan"), 0.9],
        [float("nan"), float("nan"), float("nan")],
    ],
)
def test_ignore_nans(data):
    """Check that cudf's ``Series.quantile`` agrees with pandas on NaN data.

    The cudf Series is built with ``nan_as_null=False`` so the NaN entries
    stay NaN values instead of being turned into nulls on construction;
    ``quantile`` itself therefore has to ignore them.

    Parameters
    ----------
    data : list of float
        Input values; covers both a partially-NaN and an all-NaN series.
    """
    psr = pd.Series(data)
    gsr = cudf.Series(data, nan_as_null=False)

    # pandas is the reference implementation; cudf is the value under test.
    expected = psr.quantile(0.9)
    result = gsr.quantile(0.9)
    assert_eq(expected, result)

0 comments on commit 76b35ad

Please sign in to comment.