From b2e5863266cd6990f6bba15ca2d986ed30e45f84 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 13 Dec 2024 13:02:06 -0800
Subject: [PATCH] Ignore NaN correctly in .quantile

In the non-empty branch of quantile(), convert NaNs to nulls with
self.nans_to_nulls() and use the converted column for the sort order,
for the null-count slice, and as the input to plc.quantiles.quantile.
The sort places nulls first and the slice drops the leading
null_count entries, so NaN values are now excluded the same way nulls
already were.

Add test_ignore_nans, which compares cudf.Series.quantile(0.9) against
pandas for a partly-NaN and an all-NaN input built with
nan_as_null=False.
---
Review note: in test_ignore_nans the pandas value is now bound to
`expected` and the cuDF value to `result` (they were swapped). The blob
id on the test file's index line predates that edit.

 python/cudf/cudf/core/column/numerical_base.py |  7 ++++---
 python/cudf/cudf/tests/test_quantiles.py       | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index aaf2239a71e..689d5132d45 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -143,13 +143,14 @@ def quantile(
                 ),
             )
         else:
+            no_nans = self.nans_to_nulls()
             # get sorted indices and exclude nulls
             indices = sorting.order_by(
-                [self], [True], "first", stable=True
-            ).slice(self.null_count, len(self))
+                [no_nans], [True], "first", stable=True
+            ).slice(no_nans.null_count, len(no_nans))
             with acquire_spill_lock():
                 plc_column = plc.quantiles.quantile(
-                    self.to_pylibcudf(mode="read"),
+                    no_nans.to_pylibcudf(mode="read"),
                     q,
                     plc.types.Interpolation[interpolation.upper()],
                     indices.to_pylibcudf(mode="read"),
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 9a2816f5444..84de2ac38e7 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):
 
     assert expected == actual
     assert type(expected) is type(actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [float("nan"), float("nan"), 0.9],
+        [float("nan"), float("nan"), float("nan")],
+    ],
+)
+def test_ignore_nans(data):
+    psr = pd.Series(data)
+    gsr = cudf.Series(data, nan_as_null=False)
+
+    expected = psr.quantile(0.9)
+    result = gsr.quantile(0.9)
+    assert_eq(result, expected)