diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 72d748ecb74..b957a5145ff 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -34,7 +34,7 @@ jobs: # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 create-args: >- asv - build + python-build mamba
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0a3e6bbf4e..57f8b9e86f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,13 +13,13 @@ repos: - id: mixed-line-ending - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.5.0' + rev: 'v0.6.2' hooks: - id: ruff args: ["--fix", "--show-fixes"] # https://github.com/python/black#version-control-integration - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black-jupyter - repo: https://github.com/keewis/blackdoc @@ -27,10 +27,10 @@ repos: hooks: - id: blackdoc exclude: "generate_aggregations.py" - additional_dependencies: ["black==24.4.2"] + additional_dependencies: ["black==24.8.0"] - id: blackdoc-autoupdate-black - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.1 + rev: v1.11.2 hooks: - id: mypy # Copied from setup.cfg @@ -41,7 +41,7 @@ repos: additional_dependencies: [ # Type stubs types-python-dateutil, - types-pkg_resources, + types-setuptools, types-PyYAML, types-pytz, typing-extensions>=4.1.0,
diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 55fea717f71..46d861de3bb 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,9 +1,9 @@ version: 2 build: - os: ubuntu-22.04 + os: ubuntu-lts-latest tools: - python: mambaforge-4.10 + python: mambaforge-latest jobs: post_checkout: - (git --no-pager log --pretty="tformat:%s" -1 | grep -vqF "[skip-rtd]") || exit 183
diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 0467e2eb0cd..b5c9bb6c438 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -4,7 +4,7 @@ channels: - conda-forge - nodefaults dependencies: - - python=3.10 + - python=3.12 - bottleneck - cartopy - cfgrib @@ -40,6 +40,7 @@ dependencies: - sphinx-design - sphinx-inline-tabs - sphinx>=5.0 + - sphinx-remove-toctrees - sphinxext-opengraph - sphinxext-rediraffe - zarr>=2.10
diff --git a/doc/conf.py b/doc/conf.py index 93a0e459a33..e418045207c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -88,6 +88,7 @@ "sphinxext.rediraffe", "sphinx_design", "sphinx_inline_tabs", + "sphinx_remove_toctrees", ] @@ -198,6 +199,8 @@ # The master toctree document. master_doc = "index" +remove_from_toctrees = ["generated/*"] + # General information about the project. project = "xarray" copyright = f"2014-{datetime.datetime.now().year}, xarray Developers" @@ -244,6 +247,7 @@ repository_url="https://github.com/pydata/xarray", repository_branch="main", navigation_with_keys=False, # pydata/pydata-sphinx-theme#1492 + navigation_depth=4, path_to_docs="doc", use_edit_page_button=True, use_repository_button=True,
diff --git a/doc/user-guide/groupby.rst b/doc/user-guide/groupby.rst index b41bf3eeb3a..c10ee6a659d 100644 --- a/doc/user-guide/groupby.rst +++ b/doc/user-guide/groupby.rst @@ -81,8 +81,7 @@ You can index out a particular group: ds.groupby("letters")["b"] -Just like in pandas, creating a GroupBy object is cheap: it does not actually -split the data until you access particular values. +To group by multiple variables, see :ref:`this section <groupby.multiple>`.
Binning ~~~~~~~ @@ -180,19 +179,6 @@ This last line is roughly equivalent to the following:: results.append(group - alt.sel(letters=label)) xr.concat(results, dim='x') -Iterating and Squeezing -~~~~~~~~~~~~~~~~~~~~~~~ - -Previously, Xarray defaulted to squeezing out dimensions of size one when iterating over -a GroupBy object. This behaviour is being removed. -You can always squeeze explicitly later with the Dataset or DataArray -:py:meth:`DataArray.squeeze` methods. - -.. ipython:: python - - next(iter(arr.groupby("x", squeeze=False))) - - .. _groupby.multidim: Multidimensional Grouping ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -236,6 +222,8 @@ applying your function, and then unstacking the result: stacked = da.stack(gridcell=["ny", "nx"]) stacked.groupby("gridcell").sum(...).unstack("gridcell") +Alternatively, you can groupby both ``lat`` and ``lon`` at the :ref:`same time <groupby.multiple>`. + .. _groupby.groupers: Grouper Objects ~~~~~~~~~~~~~~~ @@ -276,7 +264,8 @@ is identical to ds.groupby(x=UniqueGrouper()) -and + +Similarly, .. code-block:: python @@ -303,3 +292,26 @@ is identical to from xarray.groupers import TimeResampler ds.resample(time=TimeResampler("ME")) + + +.. _groupby.multiple: + +Grouping by multiple variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use grouper objects to group by multiple dimensions: + +.. ipython:: python + + from xarray.groupers import UniqueGrouper + + da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum() + + +Different groupers can be combined to construct sophisticated GroupBy operations. + +.. ipython:: python + + from xarray.groupers import BinGrouper + + ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()).sum()
diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3c6b7bfb58d..712ad68aeb3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,11 @@ New Features ~~~~~~~~~~~~ - Make chunk manager an option in ``set_options`` (:pull:`9362`). By `Tom White `_. +- Support for :ref:`grouping by multiple variables <groupby.multiple>`. + This is quite new, so please check your results and report bugs. + Binary operations after grouping by multiple arrays are not supported yet. + (:issue:`1056`, :issue:`9332`, :issue:`324`, :pull:`9372`). + By `Deepak Cherian `_. - Allow data variable specific ``constant_values`` in the dataset ``pad`` function (:pull:`9353`). By `Tiago Sanona `_.
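The docs and changelog above describe the headline feature of this patch: passing several named groupers to ``.groupby``. A minimal, self-contained sketch of that API, mirroring the user-guide example added in this diff (requires an xarray version containing this change; the data values are illustrative):

```python
import numpy as np
import xarray as xr
from xarray.groupers import BinGrouper, UniqueGrouper

ds = xr.Dataset(
    {"foo": (("x", "y"), np.arange(12).reshape((4, 3)))},
    coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))},
)

# Bin the numeric "x" coordinate and group the categorical "letters"
# coordinate in one operation; the result gains one dimension per
# grouper ("x_bins" and "letters").
result = ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()).sum()
print(result.sizes)  # {'y': 3, 'x_bins': 2, 'letters': 2}
```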
diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py index 60e1bbe81c1..e7eece7e81e 100644 --- a/properties/test_encode_decode.py +++ b/properties/test_encode_decode.py @@ -11,42 +11,35 @@ # isort: split import hypothesis.extra.numpy as npst -import hypothesis.strategies as st +import numpy as np from hypothesis import given import xarray as xr - -an_array = npst.arrays( - dtype=st.one_of( - npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes() - ), - shape=npst.array_shapes(max_side=3), # max_side specified for performance -) +from xarray.testing.strategies import variables @pytest.mark.slow -@given(st.data(), an_array) -def test_CFMask_coder_roundtrip(data, arr) -> None: - names = data.draw( - st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map( - tuple - ) - ) - original = xr.Variable(names, arr) +@given(original=variables()) +def test_CFMask_coder_roundtrip(original) -> None: coder = xr.coding.variables.CFMaskCoder() roundtripped = coder.decode(coder.encode(original)) xr.testing.assert_identical(original, roundtripped) +@pytest.mark.xfail +@pytest.mark.slow +@given(var=variables(dtype=npst.floating_dtypes())) +def test_CFMask_coder_decode(var) -> None: + var[0] = -99 + var.attrs["_FillValue"] = -99 + coder = xr.coding.variables.CFMaskCoder() + decoded = coder.decode(var) + assert np.isnan(decoded[0]) + + @pytest.mark.slow -@given(st.data(), an_array) -def test_CFScaleOffset_coder_roundtrip(data, arr) -> None: - names = data.draw( - st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map( - tuple - ) - ) - original = xr.Variable(names, arr) +@given(original=variables()) +def test_CFScaleOffset_coder_roundtrip(original) -> None: coder = xr.coding.variables.CFScaleOffsetCoder() roundtripped = coder.decode(coder.encode(original)) xr.testing.assert_identical(original, roundtripped) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index ad2f55e585f..f7bed2c13ef 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -145,7 +145,7 @@ def __sub__(self, other): if isinstance(other, cftime.datetime): raise TypeError("Cannot subtract a cftime.datetime from a time offset.") - elif type(other) == type(self): + elif type(other) is type(self): return type(self)(self.n - other.n) else: return NotImplemented @@ -165,7 +165,7 @@ def __radd__(self, other): return self.__add__(other) def __rsub__(self, other): - if isinstance(other, BaseCFTimeOffset) and type(self) != type(other): + if isinstance(other, BaseCFTimeOffset) and type(self) is not type(other): raise TypeError("Cannot subtract cftime offsets of differing types") return -self + other @@ -462,7 +462,7 @@ def __sub__(self, other: Self) -> Self: if isinstance(other, cftime.datetime): raise TypeError("Cannot subtract cftime.datetime from offset.") - if type(other) == type(self) and other.month == self.month: + if type(other) is type(self) and other.month == self.month: return type(self)(self.n - other.n, month=self.month) return NotImplemented @@ -548,7 +548,7 @@ def __sub__(self, other): if isinstance(other, cftime.datetime): raise TypeError("Cannot subtract cftime.datetime from offset.") - elif type(other) == type(self) and other.month == self.month: + elif type(other) is type(self) and other.month == self.month: return type(self)(self.n - other.n, month=self.month) else: return NotImplemented diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 84f229bf575..1f0544c1041 100644 
--- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6801,27 +6801,22 @@ def groupby( groupers = either_dict_or_kwargs(group, groupers, "groupby") # type: ignore group = None - grouper: Grouper + rgroupers: tuple[ResolvedGrouper, ...] if group is not None: if groupers: raise ValueError( "Providing a combination of `group` and **groupers is not supported." ) - grouper = UniqueGrouper() + rgroupers = (ResolvedGrouper(UniqueGrouper(), group, self),) else: - if len(groupers) > 1: - raise ValueError("grouping by multiple variables is not supported yet.") if not groupers: raise ValueError("Either `group` or `**groupers` must be provided.") - group, grouper = next(iter(groupers.items())) - - rgrouper = ResolvedGrouper(grouper, group, self) + rgroupers = tuple( + ResolvedGrouper(grouper, group, self) + for group, grouper in groupers.items() + ) - return DataArrayGroupBy( - self, - (rgrouper,), - restore_coord_dims=restore_coord_dims, - ) + return DataArrayGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims) @_deprecate_positional_args("v2024.07.0") def groupby_bins( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index dbc00a03025..e14176f1589 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -10397,25 +10397,22 @@ def groupby( groupers = either_dict_or_kwargs(group, groupers, "groupby") # type: ignore group = None + rgroupers: tuple[ResolvedGrouper, ...] if group is not None: if groupers: raise ValueError( "Providing a combination of `group` and **groupers is not supported." ) - rgrouper = ResolvedGrouper(UniqueGrouper(), group, self) + rgroupers = (ResolvedGrouper(UniqueGrouper(), group, self),) else: - if len(groupers) > 1: - raise ValueError("Grouping by multiple variables is not supported yet.") - elif not groupers: + if not groupers: raise ValueError("Either `group` or `**groupers` must be provided.") - for group, grouper in groupers.items(): - rgrouper = ResolvedGrouper(grouper, group, self) + rgroupers = tuple( + ResolvedGrouper(grouper, group, self) + for group, grouper in groupers.items() + ) - return DatasetGroupBy( - self, - (rgrouper,), - restore_coord_dims=restore_coord_dims, - ) + return DatasetGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims) @_deprecate_positional_args("v2024.07.0") def groupby_bins( diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 9636e65905c..3c387fde072 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -1,6 +1,8 @@ from __future__ import annotations import copy +import functools +import itertools import warnings from collections.abc import Callable, Hashable, Iterator, Mapping, Sequence from dataclasses import dataclass, field @@ -15,7 +17,7 @@ DataArrayGroupByAggregations, DatasetGroupByAggregations, ) -from xarray.core.alignment import align +from xarray.core.alignment import align, broadcast from xarray.core.arithmetic import DataArrayGroupbyArithmetic, DatasetGroupbyArithmetic from xarray.core.common import ImplementsArrayReduce, ImplementsDatasetReduce from xarray.core.concat import concat @@ -69,10 +71,11 @@ def check_reduce_dims(reduce_dims, dimensions): ) -def _codes_to_group_indices(inverse: np.ndarray, N: int) -> GroupIndices: - assert inverse.ndim == 1 +def _codes_to_group_indices(codes: np.ndarray, N: int) -> GroupIndices: + """Converts integer codes for groups to group indices.""" + assert codes.ndim == 1 groups: GroupIndices = tuple([] for _ in range(N)) - for n, g in enumerate(inverse): + for n, g in enumerate(codes): if g >= 0: 
groups[g].append(n) return groups @@ -381,6 +384,65 @@ def _resolve_group( return newgroup +@dataclass +class ComposedGrouper: + """ + Helper class for multi-variable GroupBy. + This satisfies the Grouper interface, but is awkward to wrap in ResolvedGrouper. + For one, it simply re-infers a new EncodedGroups using known information + in existing ResolvedGroupers. So passing in a `group` (hard to define), + and `obj` (pointless) is not useful. + """ + + groupers: tuple[ResolvedGrouper, ...] + + def factorize(self) -> EncodedGroups: + from xarray.groupers import EncodedGroups + + groupers = self.groupers + + # At this point all arrays have been factorized. + codes = tuple(grouper.codes for grouper in groupers) + shape = tuple(grouper.size for grouper in groupers) + # We broadcast the codes against each other + broadcasted_codes = broadcast(*codes) + # This fully broadcasted DataArray is used as a template later + first_codes = broadcasted_codes[0] + # Now we convert to a single variable GroupBy problem + _flatcodes = np.ravel_multi_index( + tuple(codes.data for codes in broadcasted_codes), shape, mode="wrap" + ) + # NaNs; as well as values outside the bins are coded by -1 + # Restore these after the raveling + mask = functools.reduce(np.logical_or, [(code == -1) for code in broadcasted_codes]) # type: ignore + _flatcodes[mask] = -1 + + midx = pd.MultiIndex.from_product( + (grouper.unique_coord.data for grouper in groupers), + names=tuple(grouper.name for grouper in groupers), + ) + # Constructing an index from the product is wrong when there are missing groups + # (e.g. binning, resampling). Account for that now. + midx = midx[np.sort(pd.unique(_flatcodes[~mask]))] + + full_index = pd.MultiIndex.from_product( + (grouper.full_index.values for grouper in groupers), + names=tuple(grouper.name for grouper in groupers), + ) + dim_name = "stacked_" + "_".join(str(grouper.name) for grouper in groupers) + + coords = Coordinates.from_pandas_multiindex(midx, dim=dim_name) + for grouper in groupers: + coords.variables[grouper.name].attrs = grouper.group.attrs + return EncodedGroups( + codes=first_codes.copy(data=_flatcodes), + full_index=full_index, + group_indices=_codes_to_group_indices(_flatcodes.ravel(), len(full_index)), + unique_coord=Variable(dims=(dim_name,), data=midx.values), + coords=coords, + ) + + class GroupBy(Generic[T_Xarray]): """A object that implements the split-apply-combine pattern. @@ -418,12 +480,12 @@ class GroupBy(Generic[T_Xarray]): "encoded", ) _obj: T_Xarray - groupers: tuple[ResolvedGrouper] + groupers: tuple[ResolvedGrouper, ...] _restore_coord_dims: bool _original_obj: T_Xarray _group_indices: GroupIndices - _codes: DataArray + _codes: tuple[DataArray, ...] _group_dim: Hashable _groups: dict[GroupKey, GroupIndex] | None @@ -441,7 +503,7 @@ class GroupBy(Generic[T_Xarray]): def __init__( self, obj: T_Xarray, - groupers: tuple[ResolvedGrouper], + groupers: tuple[ResolvedGrouper, ...], restore_coord_dims: bool = True, ) -> None: """Create a GroupBy object @@ -460,8 +522,19 @@ def __init__( self._restore_coord_dims = restore_coord_dims self.groupers = groupers - (grouper,) = groupers - self.encoded = grouper.encoded + if len(groupers) == 1: + (grouper,) = groupers + self.encoded = grouper.encoded + else: + if any( + isinstance(obj._indexes.get(grouper.name, None), PandasMultiIndex) + for grouper in groupers + ): + raise NotImplementedError( + "Grouping by multiple variables, one of which " + "wraps a Pandas MultiIndex, is not supported yet." 
+ ) + self.encoded = ComposedGrouper(groupers).factorize() # specification for the groupby operation # TODO: handle obj having variables that are not present on any of the groupers @@ -599,6 +672,12 @@ def reduce( ) -> T_Xarray: raise NotImplementedError() + def _raise_if_not_single_group(self): + if len(self.groupers) != 1: + raise NotImplementedError( + "This method is not supported for grouping by multiple variables yet." + ) + @property def groups(self) -> dict[GroupKey, GroupIndex]: """ @@ -624,13 +703,16 @@ def __iter__(self) -> Iterator[tuple[GroupKey, T_Xarray]]: return zip(self.encoded.unique_coord.data, self._iter_grouped()) def __repr__(self) -> str: - (grouper,) = self.groupers - return "{}, grouped over {!r}\n{!r} groups with labels {}.".format( - self.__class__.__name__, - grouper.name, - grouper.full_index.size, - ", ".join(format_array_flat(grouper.full_index, 30).split()), + text = ( + f"<{self.__class__.__name__}, " + f"grouped over {len(self.groupers)} grouper(s)," + f" {self._len} groups in total:" ) + for grouper in self.groupers: + coord = grouper.unique_coord + labels = ", ".join(format_array_flat(coord, 30).split()) + text += f"\n\t{grouper.name!r}: {coord.size} groups with labels {labels}" + return text + ">" def _iter_grouped(self) -> Iterator[T_Xarray]: """Iterate over each element in this group""" @@ -639,7 +721,6 @@ def _iter_grouped(self) -> Iterator[T_Xarray]: yield self._obj.isel({self._group_dim: indices}) def _infer_concat_args(self, applied_example): - if self._group_dim in applied_example.dims: coord = self.group1d positions = self.encoded.group_indices @@ -655,6 +736,7 @@ def _binary_op(self, other, f, reflexive=False): g = f if not reflexive else lambda x, y: f(y, x) + self._raise_if_not_single_group() (grouper,) = self.groupers obj = self._original_obj name = grouper.name @@ -747,27 +829,44 @@ def _maybe_restore_empty_groups(self, combined): """ from xarray.groupers import BinGrouper, TimeResampler - (grouper,) = self.groupers - if ( - isinstance(grouper.grouper, BinGrouper | TimeResampler) - and grouper.name in combined.dims - ): - indexers = {grouper.name: grouper.full_index} + indexers = {} + for grouper in self.groupers: + if ( + isinstance(grouper.grouper, BinGrouper | TimeResampler) + and grouper.name in combined.dims + ): + indexers[grouper.name] = grouper.full_index + if indexers: combined = combined.reindex(**indexers) return combined def _maybe_unstack(self, obj): """This gets called if we are applying on an array with a multidimensional group.""" - (grouper,) = self.groupers + from xarray.groupers import UniqueGrouper + stacked_dim = self._stacked_dim - inserted_dims = self._inserted_dims if stacked_dim is not None and stacked_dim in obj.dims: + inserted_dims = self._inserted_dims obj = obj.unstack(stacked_dim) for dim in inserted_dims: if dim in obj.coords: del obj.coords[dim] obj._indexes = filter_indexes_from_coords(obj._indexes, set(obj.coords)) + elif len(self.groupers) > 1: + # TODO: we could clean this up by setting the appropriate `stacked_dim` + # and `inserted_dims` + # if multiple groupers all share the same single dimension, then + # we don't stack/unstack. Do that manually now. 
+ obj = obj.unstack(*self.encoded.unique_coord.dims) + to_drop = [ + grouper.name + for grouper in self.groupers + if isinstance(grouper.group, _DummyGroup) + and isinstance(grouper.grouper, UniqueGrouper) + ] + obj = obj.drop_vars(to_drop) + return obj def _flox_reduce( @@ -784,9 +883,15 @@ def _flox_reduce( from xarray.groupers import BinGrouper obj = self._original_obj - (grouper,) = self.groupers - name = grouper.name - isbin = isinstance(grouper.grouper, BinGrouper) + variables = ( + {k: v.variable for k, v in obj.data_vars.items()} + if isinstance(obj, Dataset) + else obj._coords + ) + + any_isbin = any( + isinstance(grouper.grouper, BinGrouper) for grouper in self.groupers + ) if keep_attrs is None: keep_attrs = _get_keep_attrs(default=True) @@ -797,12 +902,27 @@ def _flox_reduce( # flox >=0.9 will choose this on its own. kwargs.setdefault("method", "cohorts") - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only: + midx_grouping_vars: tuple[Hashable, ...] = () + for grouper in self.groupers: + name = grouper.name + maybe_midx = obj._indexes.get(name, None) + if isinstance(maybe_midx, PandasMultiIndex): + midx_grouping_vars += tuple(maybe_midx.index.names) + (name,) + + # For datasets, running a numeric-only reduction on non-numeric + # variable will just drop it. + non_numeric: dict[Hashable, Variable] + if kwargs.pop("numeric_only", None): non_numeric = { name: var - for name, var in obj.data_vars.items() - if not (np.issubdtype(var.dtype, np.number) or (var.dtype == np.bool_)) + for name, var in variables.items() + if ( + not (np.issubdtype(var.dtype, np.number) or (var.dtype == np.bool_)) + # this avoids dropping any levels of a MultiIndex, which raises + # a warning + and name not in midx_grouping_vars + and name not in obj.dims + ) } else: non_numeric = {} @@ -814,15 +934,25 @@ def _flox_reduce( # set explicitly to avoid unnecessarily accumulating count kwargs["min_count"] = 0 - unindexed_dims: tuple[Hashable, ...] = tuple() - if isinstance(grouper.group, _DummyGroup) and not isbin: - unindexed_dims = (name,) + unindexed_dims: tuple[Hashable, ...] = tuple( + grouper.name + for grouper in self.groupers + if isinstance(grouper.group, _DummyGroup) + and not isinstance(grouper.grouper, BinGrouper) + ) parsed_dim: tuple[Hashable, ...] if isinstance(dim, str): parsed_dim = (dim,) elif dim is None: - parsed_dim = grouper.group.dims + parsed_dim_list = list() + # preserve order + for dim_ in itertools.chain( + *(grouper.group.dims for grouper in self.groupers) + ): + if dim_ not in parsed_dim_list: + parsed_dim_list.append(dim_) + parsed_dim = tuple(parsed_dim_list) elif dim is ...: parsed_dim = tuple(obj.dims) else: @@ -830,12 +960,15 @@ def _flox_reduce( # Do this so we raise the same error message whether flox is present or not. # Better to control it here than in flox. - if any(d not in grouper.group.dims and d not in obj.dims for d in parsed_dim): - raise ValueError(f"cannot reduce over dimensions {dim}.") + for grouper in self.groupers: + if any( + d not in grouper.group.dims and d not in obj.dims for d in parsed_dim + ): + raise ValueError(f"cannot reduce over dimensions {dim}.") if kwargs["func"] not in ["all", "any", "count"]: kwargs.setdefault("fill_value", np.nan) - if isbin and kwargs["func"] == "count": + if any_isbin and kwargs["func"] == "count": # This is an annoying hack. Xarray returns np.nan # when there are no observations in a bin, instead of 0. # We can fake that here by forcing min_count=1. 
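``ComposedGrouper.factorize`` earlier in this file collapses the per-variable integer codes into a single code array, so the existing single-grouper machinery (and the flox path below) can be reused. A numpy-only sketch of that encoding trick, with hypothetical code arrays standing in for the output of each grouper's factorize step:

```python
import numpy as np

# Integer codes for two already-factorized group variables;
# -1 marks NaNs or out-of-bin values.
codes1 = np.array([0, 1, 1, -1, 0])  # 2 unique groups
codes2 = np.array([1, 0, 1, 1, -1])  # 2 unique groups
shape = (2, 2)

# mode="wrap" keeps ravel_multi_index from raising on the -1 sentinels;
# their positions are restored explicitly afterwards.
flat = np.ravel_multi_index((codes1, codes2), shape, mode="wrap")
mask = (codes1 == -1) | (codes2 == -1)
flat[mask] = -1
print(flat)  # [ 1  2  3 -1 -1]
```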
@@ -844,13 +977,17 @@ def _flox_reduce( kwargs.setdefault("fill_value", np.nan) kwargs.setdefault("min_count", 1) - output_index = grouper.full_index + # pass RangeIndex as a hint to flox that `by` is already factorized + expected_groups = tuple( + pd.RangeIndex(len(grouper)) for grouper in self.groupers + ) + + codes = tuple(g.codes for g in self.groupers) result = xarray_reduce( obj.drop_vars(non_numeric.keys()), - self.encoded.codes, + *codes, dim=parsed_dim, - # pass RangeIndex as a hint to flox that `by` is already factorized - expected_groups=(pd.RangeIndex(len(output_index)),), + expected_groups=expected_groups, isbin=False, keep_attrs=keep_attrs, **kwargs, @@ -880,12 +1017,28 @@ def _flox_reduce( Coordinates(new_coords, new_indexes) ).drop_vars(unindexed_dims) - # broadcast and restore non-numeric data variables (backcompat) - for name, var in non_numeric.items(): - if all(d not in var.dims for d in parsed_dim): - result[name] = var.variable.set_dims( - (name,) + var.dims, (result.sizes[name],) + var.shape + # broadcast any non-dim coord variables that don't + # share all dimensions with the grouper + result_variables = ( + result._variables if isinstance(result, Dataset) else result._coords + ) + to_broadcast: dict[Hashable, Variable] = {} + for name, var in variables.items(): + dims_set = set(var.dims) + if ( + dims_set <= set(parsed_dim) + and (dims_set & set(result.dims)) + and name not in result_variables + ): + to_broadcast[name] = var + for name, var in to_broadcast.items(): + if new_dims := tuple(d for d in parsed_dim if d not in var.dims): + new_sizes = tuple( + result.sizes.get(dim, obj.sizes.get(dim)) for dim in new_dims ) + result[name] = var.set_dims( + new_dims + var.dims, new_sizes + var.shape + ).transpose(..., *result.dims) if not isinstance(result, Dataset): # only restore dimension order for arrays @@ -1047,8 +1200,7 @@ def quantile( The American Statistician, 50(4), pp. 361-365, 1996 """ if dim is None: - (grouper,) = self.groupers - dim = self.group1d.dims + dim = (self._group_dim,) # Dataset.quantile does this, do it for flox to ensure same output. q = np.asarray(q, dtype=np.float64) @@ -1159,7 +1311,8 @@ def _iter_grouped_shortcut(self): """ var = self._obj.variable for idx, indices in enumerate(self.encoded.group_indices): - yield var[{self._group_dim: indices}] + if indices: + yield var[{self._group_dim: indices}] def _concat_shortcut(self, applied, dim, positions=None): # nb. 
don't worry too much about maintaining this method -- it does @@ -1173,12 +1326,11 @@ def _concat_shortcut(self, applied, dim, positions=None): return self._obj._replace_maybe_drop_dims(reordered) def _restore_dim_order(self, stacked: DataArray) -> DataArray: - (grouper,) = self.groupers - group = self.group1d def lookup_order(dimension): - if dimension == grouper.name: - (dimension,) = group.dims + for grouper in self.groupers: + if dimension == grouper.name and grouper.group.ndim == 1: + (dimension,) = grouper.group.dims if dimension in self._obj.dims: axis = self._obj.get_axis_num(dimension) else: @@ -1186,7 +1338,10 @@ def lookup_order(dimension): return axis new_order = sorted(stacked.dims, key=lookup_order) - return stacked.transpose(*new_order, transpose_coords=self._restore_coord_dims) + stacked = stacked.transpose( + *new_order, transpose_coords=self._restore_coord_dims + ) + return stacked def map( self, diff --git a/xarray/static/css/style.css b/xarray/static/css/style.css index dbe61e311c1..d4f5c104850 100644 --- a/xarray/static/css/style.css +++ b/xarray/static/css/style.css @@ -65,7 +65,7 @@ body.vscode-dark { .xr-sections { padding-left: 0 !important; display: grid; - grid-template-columns: 150px auto auto 1fr 20px 20px; + grid-template-columns: 150px auto auto 1fr 0 20px 0 20px; } .xr-section-item { @@ -73,7 +73,8 @@ body.vscode-dark { } .xr-section-item input { - display: none; + display: inline-block; + opacity: 0; } .xr-section-item input + label { @@ -85,6 +86,10 @@ body.vscode-dark { color: var(--xr-font-color2); } +.xr-section-item input:focus + label { + border: 2px solid var(--xr-font-color0); +} + .xr-section-item input:enabled + label:hover { color: var(--xr-font-color0); } diff --git a/xarray/testing/assertions.py b/xarray/testing/assertions.py index 5bc9c8c1016..3dec6a25616 100644 --- a/xarray/testing/assertions.py +++ b/xarray/testing/assertions.py @@ -149,7 +149,7 @@ def assert_equal(a, b, from_root=True, check_dim_order: bool = True): """ __tracebackhide__ = True assert ( - type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + type(a) is type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) ) b = maybe_transpose_dims(a, b, check_dim_order) if isinstance(a, Variable | DataArray): @@ -206,7 +206,7 @@ def assert_identical(a, b, from_root=True): """ __tracebackhide__ = True assert ( - type(a) == type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) + type(a) is type(b) or isinstance(a, Coordinates) and isinstance(b, Coordinates) ) if isinstance(a, Variable): assert a.identical(b), formatting.diff_array_repr(a, b, "identical") @@ -260,7 +260,7 @@ def assert_allclose( assert_identical, assert_equal, numpy.testing.assert_allclose """ __tracebackhide__ = True - assert type(a) == type(b) + assert type(a) is type(b) b = maybe_transpose_dims(a, b, check_dim_order) equiv = functools.partial( diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index d223bce2098..ca37fbd3d99 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -295,4 +295,4 @@ def f(x, lock=None): await c.gather(futures) lock2 = pickle.loads(pickle.dumps(lock)) - assert type(lock) == type(lock2) + assert type(lock) is type(lock2) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 022fa37392e..41947d6626a 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -12,9 +12,16 @@ import xarray as xr from xarray import DataArray, Dataset, 
Variable +from xarray.core.alignment import broadcast from xarray.core.groupby import _consolidate_slices from xarray.core.types import InterpOptions -from xarray.groupers import BinGrouper, EncodedGroups, Grouper, UniqueGrouper +from xarray.groupers import ( + BinGrouper, + EncodedGroups, + Grouper, + TimeResampler, + UniqueGrouper, +) from xarray.tests import ( InaccessibleArray, assert_allclose, @@ -123,6 +130,15 @@ def test_multi_index_groupby_sum() -> None: actual = ds.stack(space=["x", "y"]).groupby("space").sum("z").unstack("space") assert_equal(expected, actual) + with pytest.raises(NotImplementedError): + actual = ( + ds.stack(space=["x", "y"]) + .groupby(space=UniqueGrouper(), z=UniqueGrouper()) + .sum("z") + .unstack("space") + ) + assert_equal(expected, ds) + if not has_pandas_ge_2_1: # the next line triggers a mysterious multiindex error on pandas 2.0 return @@ -568,27 +584,28 @@ def test_da_groupby_assign_coords() -> None: @pytest.mark.parametrize("obj", [repr_da, repr_da.to_dataset(name="a")]) def test_groupby_repr(obj, dim) -> None: actual = repr(obj.groupby(dim)) - expected = f"{obj.__class__.__name__}GroupBy" - expected += f", grouped over {dim!r}" - expected += f"\n{len(np.unique(obj[dim]))!r} groups with labels " + N = len(np.unique(obj[dim])) + expected = f"<{obj.__class__.__name__}GroupBy" + expected += f", grouped over 1 grouper(s), {N} groups in total:" + expected += f"\n\t{dim!r}: {N} groups with labels " if dim == "x": - expected += "1, 2, 3, 4, 5." + expected += "1, 2, 3, 4, 5>" elif dim == "y": - expected += "0, 1, 2, 3, 4, 5, ..., 15, 16, 17, 18, 19." + expected += "0, 1, 2, 3, 4, 5, ..., 15, 16, 17, 18, 19>" elif dim == "z": - expected += "'a', 'b', 'c'." + expected += "'a', 'b', 'c'>" elif dim == "month": - expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12." + expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>" assert actual == expected @pytest.mark.parametrize("obj", [repr_da, repr_da.to_dataset(name="a")]) def test_groupby_repr_datetime(obj) -> None: actual = repr(obj.groupby("t.month")) - expected = f"{obj.__class__.__name__}GroupBy" - expected += ", grouped over 'month'" - expected += f"\n{len(np.unique(obj.t.dt.month))!r} groups with labels " - expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12." 
+ expected = f"<{obj.__class__.__name__}GroupBy" + expected += ", grouped over 1 grouper(s), 12 groups in total:\n" + expected += "\t'month': 12 groups with labels " + expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>" assert actual == expected @@ -2646,3 +2663,139 @@ def test_weather_data_resample(use_flox): with xr.set_options(use_flox=use_flox): actual = ds.resample(time="1MS").mean() assert "location" in actual._indexes + + gb = ds.groupby(time=TimeResampler(freq="1MS"), location=UniqueGrouper()) + with xr.set_options(use_flox=use_flox): + actual = gb.mean() + expected = ds.resample(time="1MS").mean().sortby("location") + assert_allclose(actual, expected) + assert actual.time.attrs == ds.time.attrs + assert actual.location.attrs == ds.location.attrs + + assert expected.time.attrs == ds.time.attrs + assert expected.location.attrs == ds.location.attrs + + +@pytest.mark.parametrize("use_flox", [True, False]) +def test_multiple_groupers(use_flox) -> None: + da = DataArray( + np.array([1, 2, 3, 0, 2, np.nan]), + dims="d", + coords=dict( + labels1=("d", np.array(["a", "b", "c", "c", "b", "a"])), + labels2=("d", np.array(["x", "y", "z", "z", "y", "x"])), + ), + name="foo", + ) + + gb = da.groupby(labels1=UniqueGrouper(), labels2=UniqueGrouper()) + repr(gb) + + expected = DataArray( + np.array([[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 1.5]]), + dims=("labels1", "labels2"), + coords={ + "labels1": np.array(["a", "b", "c"], dtype=object), + "labels2": np.array(["x", "y", "z"], dtype=object), + }, + name="foo", + ) + with xr.set_options(use_flox=use_flox): + actual = gb.mean() + assert_identical(actual, expected) + + # ------- + coords = {"a": ("x", [0, 0, 1, 1]), "b": ("y", [0, 0, 1, 1])} + square = DataArray(np.arange(16).reshape(4, 4), coords=coords, dims=["x", "y"]) + gb = square.groupby(a=UniqueGrouper(), b=UniqueGrouper()) + repr(gb) + with xr.set_options(use_flox=use_flox): + actual = gb.mean() + expected = DataArray( + np.array([[2.5, 4.5], [10.5, 12.5]]), + dims=("a", "b"), + coords={"a": [0, 1], "b": [0, 1]}, + ) + assert_identical(actual, expected) + + expected = square.astype(np.float64) + expected["a"], expected["b"] = broadcast(square.a, square.b) + with xr.set_options(use_flox=use_flox): + assert_identical( + square.groupby(x=UniqueGrouper(), y=UniqueGrouper()).mean(), expected + ) + + b = xr.DataArray( + np.random.RandomState(0).randn(2, 3, 4), + coords={"xy": (("x", "y"), [["a", "b", "c"], ["b", "c", "c"]])}, + dims=["x", "y", "z"], + ) + gb = b.groupby(x=UniqueGrouper(), y=UniqueGrouper()) + repr(gb) + with xr.set_options(use_flox=use_flox): + assert_identical(gb.mean("z"), b.mean("z")) + + gb = b.groupby(x=UniqueGrouper(), xy=UniqueGrouper()) + repr(gb) + with xr.set_options(use_flox=use_flox): + actual = gb.mean() + expected = b.drop_vars("xy").rename({"y": "xy"}).copy(deep=True) + newval = b.isel(x=1, y=slice(1, None)).mean("y").data + expected.loc[dict(x=1, xy=1)] = expected.sel(x=1, xy=0).data + expected.loc[dict(x=1, xy=0)] = np.nan + expected.loc[dict(x=1, xy=2)] = newval + expected["xy"] = ("xy", ["a", "b", "c"]) + # TODO: is order of dims correct? 
+ assert_identical(actual, expected.transpose("z", "x", "xy")) + + +@pytest.mark.parametrize("use_flox", [True, False]) +def test_multiple_groupers_mixed(use_flox) -> None: + # This groupby has missing groups + ds = xr.Dataset( + {"foo": (("x", "y"), np.arange(12).reshape((4, 3)))}, + coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))}, + ) + gb = ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()) + expected_data = np.array( + [ + [[0.0, np.nan], [np.nan, 3.0]], + [[1.0, np.nan], [np.nan, 4.0]], + [[2.0, np.nan], [np.nan, 5.0]], + ] + ) + expected = xr.Dataset( + {"foo": (("y", "x_bins", "letters"), expected_data)}, + coords={ + "x_bins": ( + "x_bins", + np.array( + [ + pd.Interval(5, 15, closed="right"), + pd.Interval(15, 25, closed="right"), + ], + dtype=object, + ), + ), + "letters": ("letters", np.array(["a", "b"], dtype=object)), + }, + ) + with xr.set_options(use_flox=use_flox): + actual = gb.sum() + assert_identical(actual, expected) + + # assert_identical( + # b.groupby(['x', 'y']).apply(lambda x: x - x.mean()), + # b - b.mean("z"), + # ) + + # gb = square.groupby(x=UniqueGrouper(), y=UniqueGrouper()) + # gb - gb.mean() + + # ------ + + +# Possible property tests +# 1. lambda x: x +# 2. grouped-reduce on unique coords is identical to array +# 3. group_over == groupby-reduce along other dimensions diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 680a98a6500..3ebaeff712b 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -1177,7 +1177,7 @@ def setUp(self): @pytest.mark.slow def test_recover_from_seaborn_jet_exception(self) -> None: pal = _color_palette("jet", 4) - assert type(pal) == np.ndarray + assert type(pal) is np.ndarray assert len(pal) == 4 @pytest.mark.slow diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d1d5d5ada55..a1d8994a736 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1268,38 +1268,38 @@ def test_detect_indexer_type(self): v = Variable(["x", "y"], data) _, ind, _ = v._broadcast_indexes((0, 1)) - assert type(ind) == indexing.BasicIndexer + assert type(ind) is indexing.BasicIndexer _, ind, _ = v._broadcast_indexes((0, slice(0, 8, 2))) - assert type(ind) == indexing.BasicIndexer + assert type(ind) is indexing.BasicIndexer _, ind, _ = v._broadcast_indexes((0, [0, 1])) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer _, ind, _ = v._broadcast_indexes(([0, 1], 1)) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer _, ind, _ = v._broadcast_indexes(([0, 1], [1, 2])) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer _, ind, _ = v._broadcast_indexes(([0, 1], slice(0, 8, 2))) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer vind = Variable(("a",), [0, 1]) _, ind, _ = v._broadcast_indexes((vind, slice(0, 8, 2))) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer vind = Variable(("y",), [0, 1]) _, ind, _ = v._broadcast_indexes((vind, 3)) - assert type(ind) == indexing.OuterIndexer + assert type(ind) is indexing.OuterIndexer vind = Variable(("a",), [0, 1]) _, ind, _ = v._broadcast_indexes((vind, vind)) - assert type(ind) == indexing.VectorizedIndexer + assert type(ind) is indexing.VectorizedIndexer vind = Variable(("a", "b"), [[0, 2], [1, 3]]) _, ind, _ = v._broadcast_indexes((vind, 3)) - assert type(ind) == indexing.VectorizedIndexer + assert type(ind) is 
indexing.VectorizedIndexer def test_indexer_type(self): # GH:issue:1688. Wrong indexer type induces NotImplementedError @@ -2587,7 +2587,7 @@ def test_converted_types(self): for input_array in [[[0, 1, 2]], pd.DataFrame([[0, 1, 2]])]: actual = as_compatible_data(input_array) assert_array_equal(np.asarray(input_array), actual) - assert np.ndarray == type(actual) + assert np.ndarray is type(actual) assert np.asarray(input_array).dtype == actual.dtype def test_masked_array(self): @@ -2622,26 +2622,26 @@ def test_datetime(self): expected = np.datetime64("2000-01-01") actual = as_compatible_data(expected) assert expected == actual - assert np.ndarray == type(actual) + assert np.ndarray is type(actual) assert np.dtype("datetime64[ns]") == actual.dtype expected = np.array([np.datetime64("2000-01-01")]) actual = as_compatible_data(expected) assert np.asarray(expected) == actual - assert np.ndarray == type(actual) + assert np.ndarray is type(actual) assert np.dtype("datetime64[ns]") == actual.dtype expected = np.array([np.datetime64("2000-01-01", "ns")]) actual = as_compatible_data(expected) assert np.asarray(expected) == actual - assert np.ndarray == type(actual) + assert np.ndarray is type(actual) assert np.dtype("datetime64[ns]") == actual.dtype assert expected is source_ndarray(np.asarray(actual)) expected = np.datetime64("2000-01-01", "ns") actual = as_compatible_data(datetime(2000, 1, 1)) assert np.asarray(expected) == actual - assert np.ndarray == type(actual) + assert np.ndarray is type(actual) assert np.dtype("datetime64[ns]") == actual.dtype def test_tz_datetime(self) -> None:
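A recurring mechanical change in the hunks above is ``type(a) == type(b)`` becoming ``type(a) is type(b)`` (ruff's E721). Identity comparison is exact and cannot be intercepted by operator overloading; a deliberately contrived sketch of how ``==`` can mislead:

```python
# A pathological metaclass whose __eq__ always answers True, so equality
# between classes no longer means "same type".
class AlwaysEqualMeta(type):
    def __eq__(cls, other):
        return True

    def __hash__(cls):  # keep the class hashable after overriding __eq__
        return id(cls)


class Weird(metaclass=AlwaysEqualMeta):
    pass


assert type(Weird()) == int       # "passes", despite being nonsense
assert type(Weird()) is not int   # identity comparison gets it right
assert type(Weird()) is Weird
```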