Skip to content

Commit

Permalink
BUG: Groupby median on timedelta column with NaT returns odd value (p…
Browse files Browse the repository at this point in the history
…andas-dev#57926)

Handle NaT correctly in group_median_float64
  • Loading branch information
JJLLWW committed Mar 22, 2024
1 parent 8704cfa commit f9a6d3d
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ Bug fixes
- Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
- Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Fixed bug in :meth:`GroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
- Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def group_median_float64(
min_count: int = ..., # Py_ssize_t
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ..., # bint
) -> None: ...
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
Expand Down
22 changes: 17 additions & 5 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,11 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
return result


cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
) noexcept nogil:
cdef:
int i, j, na_count = 0
float64_t* tmp
Expand All @@ -111,9 +115,14 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
return NaN

# count NAs
for i in range(n):
if a[i] != a[i]:
na_count += 1
if is_datetimelike:
for i in range(n):
if a[i] == NPY_NAT:
na_count += 1
else:
for i in range(n):
if a[i] != a[i]:
na_count += 1

if na_count:
if na_count == n:
Expand All @@ -125,6 +134,8 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:

j = 0
for i in range(n):
if is_datetimelike and a[i] == NPY_NAT:
continue
if a[i] == a[i]:
tmp[j] = a[i]
j += 1
Expand Down Expand Up @@ -170,6 +181,7 @@ def group_median_float64(
Py_ssize_t min_count=-1,
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
) -> None:
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -228,7 +240,7 @@ def group_median_float64(
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
out[j, i] = median_linear(ptr, size, is_datetimelike)
ptr += size


Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ def _call_cython_op(
"last",
"first",
"sum",
"median"
]:
func(
out=result,
Expand All @@ -427,7 +428,7 @@ def _call_cython_op(
is_datetimelike=is_datetimelike,
**kwargs,
)
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
elif self.how in ["sem", "std", "var", "ohlc", "prod"]:
if self.how in ["std", "sem"]:
kwargs["is_datetimelike"] = is_datetimelike
func(
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,13 @@ def test_len_nan_group():
assert len(df.groupby(["a", "b"])) == 0


def test_groupby_timedelta_median():
# issue 57926
df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]})
median = df.groupby("label")["timedelta"].median()
assert median.loc["foo"] == Timedelta("1d")


@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
def test_len_categorical(dropna, observed, keys):
# GH#57595
Expand Down

0 comments on commit f9a6d3d

Please sign in to comment.