From 6c499fff1ee1cb2330347bf3586c72a5c84a07b3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 28 Jul 2024 23:50:13 -0600 Subject: [PATCH] Improve precision for mean, std, var, cumsum. (#90) * Improve precision for mean, std, var. np.bincount always accumulates to float64. So only cast after the division. --- numpy_groupies/aggregate_numpy.py | 28 +++++++++++++++++++--------- numpy_groupies/tests/test_generic.py | 11 +++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py index 844f2f2..dfbf11e 100644 --- a/numpy_groupies/aggregate_numpy.py +++ b/numpy_groupies/aggregate_numpy.py @@ -149,15 +149,16 @@ def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): sums.real = np.bincount(group_idx, weights=a.real, minlength=size) sums.imag = np.bincount(group_idx, weights=a.imag, minlength=size) else: - sums = np.bincount(group_idx, weights=a, minlength=size).astype( - dtype, copy=False - ) + sums = np.bincount(group_idx, weights=a, minlength=size) with np.errstate(divide="ignore", invalid="ignore"): - ret = sums.astype(dtype, copy=False) / counts + ret = sums / counts if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret + if iscomplexobj(a): + return ret + else: + return ret.astype(dtype, copy=False) def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): @@ -165,7 +166,10 @@ def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): if fill_value != 0: counts = np.bincount(group_idx, minlength=size) ret[counts == 0] = fill_value - return ret + if iscomplexobj(a): + return ret + else: + return ret.astype(dtype, copy=False) def _var( @@ -176,7 +180,7 @@ def _var( counts = np.bincount(group_idx, minlength=size) sums = np.bincount(group_idx, weights=a, minlength=size) with np.errstate(divide="ignore", invalid="ignore"): - means = sums.astype(dtype, copy=False) / counts + means = sums / counts counts = np.where(counts > ddof, counts - ddof, 0) ret = ( np.bincount(group_idx, (a - means[group_idx]) ** 2, minlength=size) / counts @@ -185,7 +189,10 @@ def _var( ret = np.sqrt(ret) # this is now std not var if not np.isnan(fill_value): ret[counts == 0] = fill_value - return ret + if iscomplexobj(a): + return ret + else: + return ret.astype(dtype, copy=False) def _std(group_idx, a, size, fill_value, dtype=np.dtype(np.float64), ddof=0): @@ -252,7 +259,10 @@ def _cumsum(group_idx, a, size, fill_value=None, dtype=None): increasing = np.arange(len(a), dtype=int) group_starts = _min(group_idx_srt, increasing, size, fill_value=0)[group_idx_srt] - a_srt_cumsum += -a_srt_cumsum[group_starts] + a_srt[group_starts] + # First subtract large numbers + a_srt_cumsum -= a_srt_cumsum[group_starts] + # Then add potentially small numbers + a_srt_cumsum += a_srt[group_starts] return a_srt_cumsum[invsortidx] diff --git a/numpy_groupies/tests/test_generic.py b/numpy_groupies/tests/test_generic.py index 2a2811d..7a8b772 100644 --- a/numpy_groupies/tests/test_generic.py +++ b/numpy_groupies/tests/test_generic.py @@ -570,3 +570,14 @@ def test_var_with_nan_fill_value(aggregate_all, ddof, nan_inds, func): group_idx, a, axis=-1, fill_value=np.nan, func=func, ddof=ddof ) np.testing.assert_equal(actual, expected) + + +def test_cumsum_accuracy(aggregate_all): + array = np.array( + [0.00000000e00, 0.00000000e00, 0.00000000e00, 3.27680000e04, 9.99999975e-06] + ) + group_idx = np.array([0, 0, 0, 0, 1]) + + actual = aggregate_all(group_idx, array, axis=-1, func="cumsum") + expected = array + np.testing.assert_allclose(actual, expected)