diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 10d5a518f686d..b68f941dfbcc0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -296,6 +296,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) +- Fixed bug in :meth:`GroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 95ac555303221..53f5f73624232 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -12,6 +12,7 @@ def group_median_float64( min_count: int = ..., # Py_ssize_t mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + is_datetimelike: bool = ..., # bint ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 2ff45038d6a3e..f3df54222a126 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -101,7 +101,11 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n return result -cdef float64_t median_linear(float64_t* a, int n) noexcept nogil: +cdef float64_t median_linear( + float64_t* a, + int n, + bint is_datetimelike=False +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -111,9 +115,14 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil: return NaN # count NAs - for i in range(n): - if a[i] != a[i]: - na_count += 1 + if is_datetimelike: + for i in range(n): + if a[i] == NPY_NAT: + na_count += 1 + else: + for i in range(n): + if a[i] != a[i]: + na_count += 1 if na_count: if na_count == n: @@ -125,6 +134,8 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil: j = 0 for i in range(n): + if is_datetimelike and a[i] == NPY_NAT: + continue if a[i] == a[i]: tmp[j] = a[i] j += 1 @@ -170,6 +181,7 @@ def group_median_float64( Py_ssize_t min_count=-1, const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, + bint is_datetimelike=False, ) -> None: """ Only aggregates on axis=0 @@ -228,7 +240,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size) + out[j, i] = median_linear(ptr, size, is_datetimelike) ptr += size diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index acf4c7bebf52d..8585ae3828247 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -415,6 +415,7 @@ def _call_cython_op( "last", "first", "sum", + "median", ]: func( out=result, @@ -427,7 +428,7 @@ def _call_cython_op( is_datetimelike=is_datetimelike, **kwargs, ) - elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: + elif self.how in ["sem", "std", "var", "ohlc", "prod"]: if self.how in ["std", "sem"]: kwargs["is_datetimelike"] = is_datetimelike func( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 00e781e6a7f07..2a9114d13c8ae 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -145,6 +145,13 @@ def test_len_nan_group(): assert len(df.groupby(["a", "b"])) == 0 +def test_groupby_timedelta_median(): + # issue 57926 + df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]}) + median = df.groupby("label")["timedelta"].median() + assert median.loc["foo"] == Timedelta("1d") + + @pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) def test_len_categorical(dropna, observed, keys): # GH#57595