From 033694b779c86e0ecd7fe215abb4463fe0be1d2e Mon Sep 17 00:00:00 2001
From: dcherian
Date: Mon, 17 Apr 2023 10:49:42 -0600
Subject: [PATCH] Fix binning by unsorted array

Closes #7759
---
 xarray/core/groupby.py       |  2 +-
 xarray/tests/test_groupby.py | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index 8df1e5fedec..b786fa60af9 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -338,7 +338,7 @@ def _factorize_bins(
     if (codes == -1).all():
         raise ValueError(f"None of the data falls within bins with edges {bins!r}")
     full_index = binned.categories
-    unique_values = binned.unique().dropna()
+    unique_values = np.sort(binned.unique().dropna())
     group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g]
 
     if len(group_indices) == 0:
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index 52c1af97fbf..5d72b800877 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -1371,17 +1371,30 @@ def test_groupby_multidim_map(self):
         )
         assert_identical(expected, actual)
 
-    def test_groupby_bins(self):
-        array = DataArray(np.arange(4), dims="dim_0")
+    @pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]])
+    def test_groupby_bins(self, coords):
+        array = DataArray(
+            np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a"
+        )
         # the first value should not be part of any group ("right" binning)
         array[0] = 99
         # bins follow conventions for pandas.cut
         # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
         bins = [0, 1.5, 5]
-        bin_coords = pd.cut(array["dim_0"], bins).categories
-        expected = DataArray(
-            [1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords}
+
+        df = array.to_dataframe()
+        df["dim_0_bins"] = pd.cut(array["dim_0"], bins)
+
+        expected_df = df.groupby("dim_0_bins").sum()
+        # TODO: can't convert df with IntervalIndex to Xarray
+
+        expected = (
+            expected_df.reset_index(drop=True)
+            .to_xarray()
+            .assign_coords(index=np.array(expected_df.index))
+            .rename({"index": "dim_0_bins"})["a"]
         )
+
         actual = array.groupby_bins("dim_0", bins=bins).sum()
         assert_identical(expected, actual)
 