diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index c38b4f2d11c..1900c208532 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -684,7 +684,6 @@ conventions.decode_cf_variables - coding.variables.UnsignedIntegerCoder coding.variables.CFMaskCoder coding.variables.CFScaleOffsetCoder diff --git a/doc/conf.py b/doc/conf.py index 4f1fc6751d2..93a0e459a33 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -153,6 +153,9 @@ "matplotlib colormap name": ":doc:`matplotlib colormap name `", "matplotlib axes object": ":py:class:`matplotlib axes object `", "colormap": ":py:class:`colormap `", + # xarray terms + "dim name": ":term:`dimension name `", + "var name": ":term:`variable name `", # objects without namespace: xarray "DataArray": "~xarray.DataArray", "Dataset": "~xarray.Dataset", diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c8f3a40e87f..2cf2d5928bf 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,8 @@ New Features ~~~~~~~~~~~~ - Make chunk manager an option in ``set_options`` (:pull:`9362`). By `Tom White `_. +- Allow data variable specific ``constant_values`` in the dataset ``pad`` function (:pull:`9353``). + By `Tiago Sanona `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -47,6 +49,9 @@ Bug fixes date "0001-01-01". (:issue:`9108`, :pull:`9116`) By `Spencer Clark `_ and `Deepak Cherian `_. +- Fix issue with passing parameters to ZarrStore.open_store when opening + datatree in zarr format (:issue:`9376`, :pull:`9377`). + By `Alfonso Ladino `_ Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0da056e8ad2..242507f9c20 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1225,7 +1225,18 @@ def open_datatree( filename_or_obj = _normalize_path(filename_or_obj) if group: parent = NodePath("/") / NodePath(group) - stores = ZarrStore.open_store(filename_or_obj, group=parent) + stores = ZarrStore.open_store( + filename_or_obj, + group=parent, + mode=mode, + synchronizer=synchronizer, + consolidated=consolidated, + consolidate_on_close=False, + chunk_store=chunk_store, + storage_options=storage_options, + stacklevel=stacklevel + 1, + zarr_version=zarr_version, + ) if not stores: ds = open_dataset( filename_or_obj, group=parent, engine="zarr", **kwargs @@ -1233,7 +1244,18 @@ def open_datatree( return DataTree.from_dict({str(parent): ds}) else: parent = NodePath("/") - stores = ZarrStore.open_store(filename_or_obj, group=parent) + stores = ZarrStore.open_store( + filename_or_obj, + group=parent, + mode=mode, + synchronizer=synchronizer, + consolidated=consolidated, + consolidate_on_close=False, + chunk_store=chunk_store, + storage_options=storage_options, + stacklevel=stacklevel + 1, + zarr_version=zarr_version, + ) ds = open_dataset(filename_or_obj, group=parent, engine="zarr", **kwargs) tree_root = DataTree.from_dict({str(parent): ds}) for path_group, store in stores.items(): diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 441ddfe7bfd..74916886026 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -261,7 +261,7 @@ def _is_time_like(units): def _check_fill_values(attrs, name, dtype): - """ "Check _FillValue and missing_value if available. + """Check _FillValue and missing_value if available. Return dictionary with raw fill values and set with encoded fill values. 
@@ -298,6 +298,72 @@ def _check_fill_values(attrs, name, dtype): return raw_fill_dict, encoded_fill_values +def _convert_unsigned_fill_value( + name: T_Name, + data: Any, + unsigned: str, + raw_fill_value: Any, + encoded_fill_values: set, +) -> Any: + if data.dtype.kind == "i": + if unsigned == "true": + unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=unsigned_dtype) + if raw_fill_value is not None: + new_fill = np.array(raw_fill_value, dtype=data.dtype) + encoded_fill_values.remove(raw_fill_value) + # use view here to prevent OverflowError + encoded_fill_values.add(new_fill.view(unsigned_dtype).item()) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + elif data.dtype.kind == "u": + if unsigned == "false": + signed_dtype = np.dtype(f"i{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=signed_dtype) + data = lazy_elemwise_func(data, transform, signed_dtype) + if raw_fill_value is not None: + new_fill = signed_dtype.type(raw_fill_value) + encoded_fill_values.remove(raw_fill_value) + encoded_fill_values.add(new_fill) + else: + warnings.warn( + f"variable {name!r} has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + SerializationWarning, + stacklevel=3, + ) + return data + + +def _encode_unsigned_fill_value( + name: T_Name, + fill_value: Any, + encoded_dtype: np.dtype, +) -> Any: + try: + if hasattr(fill_value, "item"): + # if numpy type, convert to python native integer to determine overflow + # otherwise numpy unsigned ints will silently cast to the signed counterpart + fill_value = fill_value.item() + # passes if provided fill value fits in encoded on-disk type + new_fill = encoded_dtype.type(fill_value) + except OverflowError: + encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned" + warnings.warn( + f"variable {name!r} will be stored as {encoded_kind_str} integers " + f"but _FillValue attribute can't be represented as a " + f"{encoded_kind_str} integer.", + SerializationWarning, + stacklevel=3, + ) + # user probably provided the fill as the in-memory dtype, + # convert to on-disk type to match CF standard + orig_kind = "u" if encoded_dtype.kind == "i" else "i" + orig_dtype = np.dtype(f"{orig_kind}{encoded_dtype.itemsize}") + # use view here to prevent OverflowError + new_fill = np.array(fill_value, dtype=orig_dtype).view(encoded_dtype).item() + return new_fill + + class CFMaskCoder(VariableCoder): """Mask or unmask fill values according to CF conventions.""" @@ -305,11 +371,14 @@ def encode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_encoding(variable) dtype = np.dtype(encoding.get("dtype", data.dtype)) + # from netCDF best practices + # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data + # "_Unsigned = "true" to indicate that + # integer data should be treated as unsigned" + has_unsigned = encoding.get("_Unsigned") is not None fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - # to properly handle _FillValue/missing_value below [a], [b] - # we need to check if unsigned data is written as signed data - unsigned = encoding.get("_Unsigned") is not None + fill_value = None fv_exists = fv is not None mv_exists = mv is not None @@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None): if fv_exists: # Ensure _FillValue is cast to same dtype as data's - # [a] need to skip this if _Unsigned is available - if not unsigned: - encoding["_FillValue"] = dtype.type(fv) + 
encoding["_FillValue"] = ( + _encode_unsigned_fill_value(name, fv, dtype) + if has_unsigned + else dtype.type(fv) + ) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if mv_exists: # try to use _FillValue, if it exists to align both values # or use missing_value and ensure it's cast to same dtype as data's - # [b] need to provide mv verbatim if _Unsigned is available encoding["missing_value"] = attrs.get( "_FillValue", - (dtype.type(mv) if not unsigned else mv), + ( + _encode_unsigned_fill_value(name, mv, dtype) + if has_unsigned + else dtype.type(mv) + ), ) fill_value = pop_to(encoding, attrs, "missing_value", name=name) # apply fillna - if not pd.isnull(fill_value): + if fill_value is not None and not pd.isnull(fill_value): # special case DateTime to properly handle NaT if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": data = duck_array_ops.where( @@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None): else: data = duck_array_ops.fillna(data, fill_value) + if fill_value is not None and has_unsigned: + pop_to(encoding, attrs, "_Unsigned") + # XXX: Is this actually needed? Doesn't the backend handle this? + data = duck_array_ops.astype(duck_array_ops.around(data), dtype) + attrs["_FillValue"] = fill_value + return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None): raw_fill_dict, encoded_fill_values = _check_fill_values( variable.attrs, name, variable.dtype ) + if "_Unsigned" not in variable.attrs and not raw_fill_dict: + return variable - if raw_fill_dict: - dims, data, attrs, encoding = unpack_for_decoding(variable) - [ - safe_setitem(encoding, attr, value, name=name) - for attr, value in raw_fill_dict.items() - ] - - if encoded_fill_values: - # special case DateTime to properly handle NaT - dtype: np.typing.DTypeLike - decoded_fill_value: Any - if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": - dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min + dims, data, attrs, encoding = unpack_for_decoding(variable) + + # Even if _Unsigned is use, retain on-disk _FillValue + [ + safe_setitem(encoding, attr, value, name=name) + for attr, value in raw_fill_dict.items() + ] + + if "_Unsigned" in attrs: + unsigned = pop_to(attrs, encoding, "_Unsigned") + data = _convert_unsigned_fill_value( + name, + data, + unsigned, + raw_fill_dict.get("_FillValue"), + encoded_fill_values, + ) + + if encoded_fill_values: + # special case DateTime to properly handle NaT + dtype: np.typing.DTypeLike + decoded_fill_value: Any + if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": + dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min + else: + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: - if "scale_factor" not in attrs and "add_offset" not in attrs: - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) - else: - dtype, decoded_fill_value = ( - _choose_float_dtype(data.dtype, attrs), - np.nan, - ) + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) - transform = partial( - _apply_mask, - encoded_fill_values=encoded_fill_values, - decoded_fill_value=decoded_fill_value, - dtype=dtype, - ) - data = lazy_elemwise_func(data, transform, dtype) + transform = partial( + _apply_mask, + encoded_fill_values=encoded_fill_values, + decoded_fill_value=decoded_fill_value, + dtype=dtype, + ) + data = lazy_elemwise_func(data, transform, dtype) - 
return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable + return Variable(dims, data, attrs, encoding, fastpath=True) def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike): @@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable -class UnsignedIntegerCoder(VariableCoder): - def encode(self, variable: Variable, name: T_Name = None) -> Variable: - # from netCDF best practices - # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data - # "_Unsigned = "true" to indicate that - # integer data should be treated as unsigned" - if variable.encoding.get("_Unsigned", "false") == "true": - dims, data, attrs, encoding = unpack_for_encoding(variable) - - pop_to(encoding, attrs, "_Unsigned") - # we need the on-disk type here - # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available - signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}")) - if "_FillValue" in attrs: - try: - # user provided the on-disk signed fill - new_fill = signed_dtype.type(attrs["_FillValue"]) - except OverflowError: - # user provided the in-memory unsigned fill, convert to signed type - unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") - # use view here to prevent OverflowError - new_fill = ( - np.array(attrs["_FillValue"], dtype=unsigned_dtype) - .view(signed_dtype) - .item() - ) - attrs["_FillValue"] = new_fill - data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - def decode(self, variable: Variable, name: T_Name = None) -> Variable: - if "_Unsigned" in variable.attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - unsigned = pop_to(attrs, encoding, "_Unsigned") - - if data.dtype.kind == "i": - if unsigned == "true": - unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=unsigned_dtype) - if "_FillValue" in attrs: - new_fill = np.array(attrs["_FillValue"], dtype=data.dtype) - # use view here to prevent OverflowError - attrs["_FillValue"] = new_fill.view(unsigned_dtype).item() - data = lazy_elemwise_func(data, transform, unsigned_dtype) - elif data.dtype.kind == "u": - if unsigned == "false": - signed_dtype = np.dtype(f"i{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=signed_dtype) - data = lazy_elemwise_func(data, transform, signed_dtype) - if "_FillValue" in attrs: - new_fill = signed_dtype.type(attrs["_FillValue"]) - attrs["_FillValue"] = new_fill - else: - warnings.warn( - f"variable {name!r} has _Unsigned attribute but is not " - "of integer type. 
Ignoring attribute.", - SerializationWarning, - stacklevel=3, - ) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - class DefaultFillvalueCoder(VariableCoder): """Encode default _FillValue if needed.""" diff --git a/xarray/conventions.py b/xarray/conventions.py index d572b215d2d..18a81938225 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -187,7 +187,6 @@ def encode_cf_variable( times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), - variables.UnsignedIntegerCoder(), variables.NativeEnumCoder(), variables.NonStringCoder(), variables.DefaultFillvalueCoder(), @@ -279,7 +278,6 @@ def decode_cf_variable( if mask_and_scale: for coder in [ - variables.UnsignedIntegerCoder(), variables.CFMaskCoder(), variables.CFScaleOffsetCoder(), ]: diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 251edd1fc6f..3b852b962bf 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -352,7 +352,7 @@ def _construct_direct( return obj @classmethod - def from_pandas_multiindex(cls, midx: pd.MultiIndex, dim: str) -> Self: + def from_pandas_multiindex(cls, midx: pd.MultiIndex, dim: Hashable) -> Self: """Wrap a pandas multi-index as Xarray coordinates (dimension + levels). The returned coordinates can be directly assigned to a diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0b9a085cebc..dbc00a03025 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -163,6 +163,7 @@ ReindexMethodOptions, SideOptions, T_ChunkDimFreq, + T_DatasetPadConstantValues, T_Xarray, ) from xarray.core.weighted import DatasetWeighted @@ -9153,9 +9154,7 @@ def pad( stat_length: ( int | tuple[int, int] | Mapping[Any, tuple[int, int]] | None ) = None, - constant_values: ( - float | tuple[float, float] | Mapping[Any, tuple[float, float]] | None - ) = None, + constant_values: T_DatasetPadConstantValues | None = None, end_values: int | tuple[int, int] | Mapping[Any, tuple[int, int]] | None = None, reflect_type: PadReflectOptions = None, keep_attrs: bool | None = None, @@ -9211,17 +9210,19 @@ def pad( (stat_length,) or int is a shortcut for before = after = statistic length for all axes. Default is ``None``, to use the entire axis. - constant_values : scalar, tuple or mapping of hashable to tuple, default: 0 - Used in 'constant'. The values to set the padded values for each - axis. + constant_values : scalar, tuple, mapping of dim name to scalar or tuple, or \ + mapping of var name to scalar, tuple or to mapping of dim name to scalar or tuple, default: None + Used in 'constant'. The values to set the padded values for each data variable / axis. + ``{var_1: {dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}, ... + var_M: (before, after)}`` unique pad constants per data variable. ``{dim_1: (before_1, after_1), ... dim_N: (before_N, after_N)}`` unique pad constants along each dimension. ``((before, after),)`` yields same before and after constants for each dimension. ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for all dimensions. - Default is 0. - end_values : scalar, tuple or mapping of hashable to tuple, default: 0 + Default is ``None``, pads with ``np.nan``. + end_values : scalar, tuple or mapping of hashable to tuple, default: None Used in 'linear_ramp'. The values used for the ending value of the linear_ramp and that will form the edge of the padded array. ``{dim_1: (before_1, after_1), ... 
dim_N: (before_N, after_N)}`` unique @@ -9230,7 +9231,7 @@ def pad( axis. ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for all axes. - Default is 0. + Default is None. reflect_type : {"even", "odd", None}, optional Used in "reflect", and "symmetric". The "even" style is the default with an unaltered reflection around the edge value. For @@ -9304,11 +9305,22 @@ def pad( if not var_pad_width: variables[name] = var elif name in self.data_vars: + if utils.is_dict_like(constant_values): + if name in constant_values.keys(): + filtered_constant_values = constant_values[name] + elif not set(var.dims).isdisjoint(constant_values.keys()): + filtered_constant_values = { + k: v for k, v in constant_values.items() if k in var.dims + } + else: + filtered_constant_values = 0 # TODO: https://github.com/pydata/xarray/pull/9353#discussion_r1724018352 + else: + filtered_constant_values = constant_values variables[name] = var.pad( pad_width=var_pad_width, mode=mode, stat_length=stat_length, - constant_values=constant_values, + constant_values=filtered_constant_values, end_values=end_values, reflect_type=reflect_type, keep_attrs=keep_attrs, diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index faeb0c538c3..833466ffe9e 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -23,7 +23,7 @@ from xarray.core.formatting import format_array_flat from xarray.core.indexes import ( PandasIndex, - create_default_index_implicit, + PandasMultiIndex, filter_indexes_from_coords, ) from xarray.core.options import OPTIONS, _get_keep_attrs @@ -54,7 +54,7 @@ from xarray.core.dataset import Dataset from xarray.core.types import GroupIndex, GroupIndices, GroupKey from xarray.core.utils import Frozen - from xarray.groupers import Grouper + from xarray.groupers import EncodedGroups, Grouper def check_reduce_dims(reduce_dims, dimensions): @@ -273,16 +273,19 @@ class ResolvedGrouper(Generic[T_DataWithCoords]): obj: T_DataWithCoords # returned by factorize: - codes: DataArray = field(init=False, repr=False) - full_index: pd.Index = field(init=False, repr=False) - group_indices: GroupIndices = field(init=False, repr=False) - unique_coord: Variable | _DummyGroup = field(init=False, repr=False) + encoded: EncodedGroups = field(init=False, repr=False) - # _ensure_1d: - group1d: T_Group = field(init=False, repr=False) - stacked_obj: T_DataWithCoords = field(init=False, repr=False) - stacked_dim: Hashable | None = field(init=False, repr=False) - inserted_dims: list[Hashable] = field(init=False, repr=False) + @property + def full_index(self) -> pd.Index: + return self.encoded.full_index + + @property + def codes(self) -> DataArray: + return self.encoded.codes + + @property + def unique_coord(self) -> Variable | _DummyGroup: + return self.encoded.unique_coord def __post_init__(self) -> None: # This copy allows the BinGrouper.factorize() method @@ -294,20 +297,13 @@ def __post_init__(self) -> None: self.group = _resolve_group(self.obj, self.group) - ( - self.group1d, - self.stacked_obj, - self.stacked_dim, - self.inserted_dims, - ) = _ensure_1d(group=self.group, obj=self.obj) - - self.factorize() + self.encoded = self.grouper.factorize(self.group) @property def name(self) -> Hashable: """Name for the grouped coordinate after reduction.""" # the name has to come from unique_coord because we need `_bins` suffix for BinGrouper - (name,) = self.unique_coord.dims + (name,) = self.encoded.unique_coord.dims return name @property @@ -317,33 +313,7 @@ def size(self) -> int: def 
__len__(self) -> int: """Number of groups.""" - return len(self.full_index) - - @property - def dims(self): - return self.group1d.dims - - def factorize(self) -> None: - encoded = self.grouper.factorize(self.group1d) - - self.codes = encoded.codes - self.full_index = encoded.full_index - - if encoded.group_indices is not None: - self.group_indices = encoded.group_indices - else: - self.group_indices = tuple( - g - for g in _codes_to_group_indices(self.codes.data, len(self.full_index)) - if g - ) - if encoded.unique_coord is None: - unique_values = self.full_index[np.unique(encoded.codes)] - self.unique_coord = Variable( - dims=self.codes.name, data=unique_values, attrs=self.group.attrs - ) - else: - self.unique_coord = encoded.unique_coord + return len(self.encoded.full_index) def _validate_groupby_squeeze(squeeze: Literal[False]) -> None: @@ -428,31 +398,29 @@ class GroupBy(Generic[T_Xarray]): """ __slots__ = ( - "_full_index", - "_inserted_dims", - "_group", "_group_dim", - "_group_indices", - "_groups", "groupers", "_obj", "_restore_coord_dims", - "_stacked_dim", - "_unique_coord", + # cached properties + "_groups", "_dims", "_sizes", + "_len", # Save unstacked object for flox "_original_obj", - "_original_group", - "_bins", "_codes", + # stack nD vars + "group1d", + "_stacked_dim", + "_inserted_dims", + "encoded", ) _obj: T_Xarray groupers: tuple[ResolvedGrouper] _restore_coord_dims: bool _original_obj: T_Xarray - _original_group: T_Group _group_indices: GroupIndices _codes: DataArray _group_dim: Hashable @@ -460,6 +428,14 @@ class GroupBy(Generic[T_Xarray]): _groups: dict[GroupKey, GroupIndex] | None _dims: tuple[Hashable, ...] | Frozen[Hashable, int] | None _sizes: Mapping[Hashable, int] | None + _len: int + + # _ensure_1d: + group1d: T_Group + _stacked_dim: Hashable | None + _inserted_dims: list[Hashable] + + encoded: EncodedGroups def __init__( self, @@ -479,26 +455,26 @@ def __init__( If True, also restore the dimension order of multi-dimensional coordinates. """ - self.groupers = groupers - self._original_obj = obj + self._restore_coord_dims = restore_coord_dims + self.groupers = groupers - (grouper,) = self.groupers - self._original_group = grouper.group + (grouper,) = groupers + self.encoded = grouper.encoded # specification for the groupby operation - self._obj = grouper.stacked_obj - self._restore_coord_dims = restore_coord_dims - - # These should generalize to multiple groupers - self._group_indices = grouper.group_indices - self._codes = self._maybe_unstack(grouper.codes) + # TODO: handle obj having variables that are not present on any of the groupers + # simple broadcasting fails for ExtensionArrays. 
+ (self.group1d, self._obj, self._stacked_dim, self._inserted_dims) = _ensure_1d( + group=self.encoded.codes, obj=obj + ) + (self._group_dim,) = self.group1d.dims - (self._group_dim,) = grouper.group1d.dims # cached attributes self._groups = None self._dims = None self._sizes = None + self._len = len(self.encoded.full_index) @property def sizes(self) -> Mapping[Hashable, int]: @@ -512,8 +488,7 @@ def sizes(self) -> Mapping[Hashable, int]: Dataset.sizes """ if self._sizes is None: - (grouper,) = self.groupers - index = self._group_indices[0] + index = self.encoded.group_indices[0] self._sizes = self._obj.isel({self._group_dim: index}).sizes return self._sizes @@ -546,24 +521,22 @@ def groups(self) -> dict[GroupKey, GroupIndex]: """ # provided to mimic pandas.groupby if self._groups is None: - (grouper,) = self.groupers - self._groups = dict(zip(grouper.unique_coord.values, self._group_indices)) + self._groups = dict( + zip(self.encoded.unique_coord.data, self.encoded.group_indices) + ) return self._groups def __getitem__(self, key: GroupKey) -> T_Xarray: """ Get DataArray or Dataset corresponding to a particular group label. """ - (grouper,) = self.groupers return self._obj.isel({self._group_dim: self.groups[key]}) def __len__(self) -> int: - (grouper,) = self.groupers - return grouper.size + return self._len def __iter__(self) -> Iterator[tuple[GroupKey, T_Xarray]]: - (grouper,) = self.groupers - return zip(grouper.unique_coord.data, self._iter_grouped()) + return zip(self.encoded.unique_coord.data, self._iter_grouped()) def __repr__(self) -> str: (grouper,) = self.groupers @@ -576,28 +549,20 @@ def __repr__(self) -> str: def _iter_grouped(self) -> Iterator[T_Xarray]: """Iterate over each element in this group""" - (grouper,) = self.groupers - for idx, indices in enumerate(self._group_indices): - yield self._obj.isel({self._group_dim: indices}) + for indices in self.encoded.group_indices: + if indices: + yield self._obj.isel({self._group_dim: indices}) def _infer_concat_args(self, applied_example): - from xarray.groupers import BinGrouper - (grouper,) = self.groupers if self._group_dim in applied_example.dims: - coord = grouper.group1d - positions = self._group_indices + coord = self.group1d + positions = self.encoded.group_indices else: - coord = grouper.unique_coord + coord = self.encoded.unique_coord positions = None (dim,) = coord.dims - if isinstance(grouper.group, _DummyGroup) and not isinstance( - grouper.grouper, BinGrouper - ): - # When binning we actually do set the index - coord = None - coord = getattr(coord, "variable", coord) - return coord, dim, positions + return dim, positions def _binary_op(self, other, f, reflexive=False): from xarray.core.dataarray import DataArray @@ -609,7 +574,7 @@ def _binary_op(self, other, f, reflexive=False): obj = self._original_obj name = grouper.name group = grouper.group - codes = self._codes + codes = self.encoded.codes dims = group.dims if isinstance(group, _DummyGroup): @@ -710,8 +675,8 @@ def _maybe_unstack(self, obj): """This gets called if we are applying on an array with a multidimensional group.""" (grouper,) = self.groupers - stacked_dim = grouper.stacked_dim - inserted_dims = grouper.inserted_dims + stacked_dim = self._stacked_dim + inserted_dims = self._inserted_dims if stacked_dim is not None and stacked_dim in obj.dims: obj = obj.unstack(stacked_dim) for dim in inserted_dims: @@ -797,7 +762,7 @@ def _flox_reduce( output_index = grouper.full_index result = xarray_reduce( obj.drop_vars(non_numeric.keys()), - self._codes, + 
self.encoded.codes, dim=parsed_dim, # pass RangeIndex as a hint to flox that `by` is already factorized expected_groups=(pd.RangeIndex(len(output_index)),), @@ -808,15 +773,27 @@ def _flox_reduce( # we did end up reducing over dimension(s) that are # in the grouped variable - group_dims = grouper.group.dims - if set(group_dims).issubset(set(parsed_dim)): - result = result.assign_coords( - Coordinates( - coords={name: (name, np.array(output_index))}, - indexes={name: PandasIndex(output_index, dim=name)}, + group_dims = set(grouper.group.dims) + new_coords = {} + if group_dims.issubset(set(parsed_dim)): + new_indexes = {} + for grouper in self.groupers: + output_index = grouper.full_index + if isinstance(output_index, pd.RangeIndex): + continue + name = grouper.name + new_coords[name] = IndexVariable( + dims=name, data=np.array(output_index), attrs=grouper.codes.attrs ) - ) - result = result.drop_vars(unindexed_dims) + index_cls = ( + PandasIndex + if not isinstance(output_index, pd.MultiIndex) + else PandasMultiIndex + ) + new_indexes[name] = index_cls(output_index, dim=name) + result = result.assign_coords( + Coordinates(new_coords, new_indexes) + ).drop_vars(unindexed_dims) # broadcast and restore non-numeric data variables (backcompat) for name, var in non_numeric.items(): @@ -986,7 +963,7 @@ def quantile( """ if dim is None: (grouper,) = self.groupers - dim = grouper.group1d.dims + dim = self.group1d.dims # Dataset.quantile does this, do it for flox to ensure same output. q = np.asarray(q, dtype=np.float64) @@ -1038,7 +1015,7 @@ def _first_or_last(self, op, skipna, keep_attrs): if all( isinstance(maybe_slice, slice) and (maybe_slice.stop == maybe_slice.start + 1) - for maybe_slice in self._group_indices + for maybe_slice in self.encoded.group_indices ): # NB. 
this is currently only used for reductions along an existing # dimension @@ -1087,8 +1064,7 @@ class DataArrayGroupByBase(GroupBy["DataArray"], DataArrayGroupbyArithmetic): @property def dims(self) -> tuple[Hashable, ...]: if self._dims is None: - (grouper,) = self.groupers - index = self._group_indices[0] + index = self.encoded.group_indices[0] self._dims = self._obj.isel({self._group_dim: index}).dims return self._dims @@ -1097,8 +1073,7 @@ def _iter_grouped_shortcut(self): metadata """ var = self._obj.variable - (grouper,) = self.groupers - for idx, indices in enumerate(self._group_indices): + for idx, indices in enumerate(self.encoded.group_indices): yield var[{self._group_dim: indices}] def _concat_shortcut(self, applied, dim, positions=None): @@ -1109,14 +1084,12 @@ def _concat_shortcut(self, applied, dim, positions=None): # TODO: benbovy - explicit indexes: this fast implementation doesn't # create an explicit index for the stacked dim coordinate stacked = Variable.concat(applied, dim, shortcut=True) - - (grouper,) = self.groupers - reordered = _maybe_reorder(stacked, dim, positions, N=grouper.group.size) + reordered = _maybe_reorder(stacked, dim, positions, N=self.group1d.size) return self._obj._replace_maybe_drop_dims(reordered) def _restore_dim_order(self, stacked: DataArray) -> DataArray: (grouper,) = self.groupers - group = grouper.group1d + group = self.group1d def lookup_order(dimension): if dimension == grouper.name: @@ -1200,24 +1173,21 @@ def apply(self, func, shortcut=False, args=(), **kwargs): def _combine(self, applied, shortcut=False): """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) - coord, dim, positions = self._infer_concat_args(applied_example) + dim, positions = self._infer_concat_args(applied_example) if shortcut: combined = self._concat_shortcut(applied, dim, positions) else: combined = concat(applied, dim) - (grouper,) = self.groupers - combined = _maybe_reorder(combined, dim, positions, N=grouper.group.size) + combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size) if isinstance(combined, type(self._obj)): # only restore dimension order for arrays combined = self._restore_dim_order(combined) # assign coord and index when the applied function does not return that coord - if coord is not None and dim not in applied_example.dims: - index, index_vars = create_default_index_implicit(coord) - indexes = {k: index for k in index_vars} - combined = combined._overwrite_indexes(indexes, index_vars) - combined = self._maybe_restore_empty_groups(combined) + if dim not in applied_example.dims: + combined = combined.assign_coords(self.encoded.coords) combined = self._maybe_unstack(combined) + combined = self._maybe_restore_empty_groups(combined) return combined def reduce( @@ -1297,8 +1267,7 @@ class DatasetGroupByBase(GroupBy["Dataset"], DatasetGroupbyArithmetic): @property def dims(self) -> Frozen[Hashable, int]: if self._dims is None: - (grouper,) = self.groupers - index = self._group_indices[0] + index = self.encoded.group_indices[0] self._dims = self._obj.isel({self._group_dim: index}).dims return FrozenMappingWarningOnValuesAccess(self._dims) @@ -1362,17 +1331,14 @@ def apply(self, func, args=(), shortcut=None, **kwargs): def _combine(self, applied): """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) - coord, dim, positions = self._infer_concat_args(applied_example) + dim, positions = self._infer_concat_args(applied_example) combined = concat(applied, 
dim) - (grouper,) = self.groupers - combined = _maybe_reorder(combined, dim, positions, N=grouper.group.size) + combined = _maybe_reorder(combined, dim, positions, N=self.group1d.size) # assign coord when the applied function does not return that coord - if coord is not None and dim not in applied_example.dims: - index, index_vars = create_default_index_implicit(coord) - indexes = {k: index for k in index_vars} - combined = combined._overwrite_indexes(indexes, index_vars) - combined = self._maybe_restore_empty_groups(combined) + if dim not in applied_example.dims: + combined = combined.assign_coords(self.encoded.coords) combined = self._maybe_unstack(combined) + combined = self._maybe_restore_empty_groups(combined) return combined def reduce( diff --git a/xarray/core/types.py b/xarray/core/types.py index 0e432283ba9..3eb97f86c4a 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -243,6 +243,11 @@ def copy( "symmetric", "wrap", ] +T_PadConstantValues = float | tuple[float, float] +T_VarPadConstantValues = T_PadConstantValues | Mapping[Any, T_PadConstantValues] +T_DatasetPadConstantValues = ( + T_VarPadConstantValues | Mapping[Any, T_VarPadConstantValues] +) PadReflectOptions = Literal["even", "odd", None] CFCalendar = Literal[ diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 3cd8e4acbd5..a74fb4d8ce9 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -65,6 +65,7 @@ Self, T_Chunks, T_DuckArray, + T_VarPadConstantValues, ) from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint @@ -1121,9 +1122,14 @@ def shift(self, shifts=None, fill_value=dtypes.NA, **shifts_kwargs): def _pad_options_dim_to_index( self, - pad_option: Mapping[Any, int | tuple[int, int]], + pad_option: Mapping[Any, int | float | tuple[int, int] | tuple[float, float]], fill_with_shape=False, ): + # change number values to a tuple of two of those values + for k, v in pad_option.items(): + if isinstance(v, numbers.Number): + pad_option[k] = (v, v) + if fill_with_shape: return [ (n, n) if d not in pad_option else pad_option[d] @@ -1138,9 +1144,7 @@ def pad( stat_length: ( int | tuple[int, int] | Mapping[Any, tuple[int, int]] | None ) = None, - constant_values: ( - float | tuple[float, float] | Mapping[Any, tuple[float, float]] | None - ) = None, + constant_values: T_VarPadConstantValues | None = None, end_values: int | tuple[int, int] | Mapping[Any, tuple[int, int]] | None = None, reflect_type: PadReflectOptions = None, keep_attrs: bool | None = None, @@ -1160,7 +1164,7 @@ def pad( stat_length : int, tuple or mapping of hashable to tuple Used in 'maximum', 'mean', 'median', and 'minimum'. Number of values at edge of each axis used to calculate the statistic value. - constant_values : scalar, tuple or mapping of hashable to tuple + constant_values : scalar, tuple or mapping of hashable to scalar or tuple Used in 'constant'. The values to set the padded values for each axis. 
end_values : scalar, tuple or mapping of hashable to tuple @@ -1207,10 +1211,6 @@ def pad( if stat_length is None and mode in ["maximum", "mean", "median", "minimum"]: stat_length = [(n, n) for n in self.data.shape] # type: ignore[assignment] - # change integer values to a tuple of two of those values and change pad_width to index - for k, v in pad_width.items(): - if isinstance(v, numbers.Number): - pad_width[k] = (v, v) pad_width_by_index = self._pad_options_dim_to_index(pad_width) # create pad_options_kwargs, numpy/dask requires only relevant kwargs to be nonempty diff --git a/xarray/groupers.py b/xarray/groupers.py index 98409dfe542..f70cad655e8 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -9,13 +9,14 @@ import datetime from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast import numpy as np import pandas as pd from xarray.coding.cftime_offsets import _new_to_legacy_freq from xarray.core import duck_array_ops +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.groupby import T_Group, _DummyGroup from xarray.core.indexes import safe_cast_to_index @@ -35,7 +36,18 @@ RESAMPLE_DIM = "__resample_dim__" -@dataclass +def _coordinates_from_variable(variable: Variable) -> Coordinates: + from xarray.core.indexes import create_default_index_implicit + + (name,) = variable.dims + new_index, index_vars = create_default_index_implicit(variable) + indexes = {k: new_index for k in index_vars} + new_vars = new_index.create_variables() + new_vars[name].attrs = variable.attrs + return Coordinates(new_vars, indexes) + + +@dataclass(init=False) class EncodedGroups: """ Dataclass for storing intermediate values for GroupBy operation. 
@@ -57,18 +69,49 @@ class EncodedGroups: codes: DataArray full_index: pd.Index - group_indices: GroupIndices | None = field(default=None) - unique_coord: Variable | _DummyGroup | None = field(default=None) - - def __post_init__(self): - assert isinstance(self.codes, DataArray) - if self.codes.name is None: + group_indices: GroupIndices + unique_coord: Variable | _DummyGroup + coords: Coordinates + + def __init__( + self, + codes: DataArray, + full_index: pd.Index, + group_indices: GroupIndices | None = None, + unique_coord: Variable | _DummyGroup | None = None, + coords: Coordinates | None = None, + ): + from xarray.core.groupby import _codes_to_group_indices + + assert isinstance(codes, DataArray) + if codes.name is None: raise ValueError("Please set a name on the array you are grouping by.") - assert isinstance(self.full_index, pd.Index) - assert ( - isinstance(self.unique_coord, Variable | _DummyGroup) - or self.unique_coord is None - ) + self.codes = codes + assert isinstance(full_index, pd.Index) + self.full_index = full_index + + if group_indices is None: + self.group_indices = tuple( + g + for g in _codes_to_group_indices(codes.data.ravel(), len(full_index)) + if g + ) + else: + self.group_indices = group_indices + + if unique_coord is None: + unique_values = full_index[np.unique(codes)] + self.unique_coord = Variable( + dims=codes.name, data=unique_values, attrs=codes.attrs + ) + else: + self.unique_coord = unique_coord + + if coords is None: + assert not isinstance(self.unique_coord, _DummyGroup) + self.coords = _coordinates_from_variable(self.unique_coord) + else: + self.coords = coords class Grouper(ABC): @@ -111,11 +154,14 @@ class UniqueGrouper(Grouper): def group_as_index(self) -> pd.Index: """Caches the group DataArray as a pandas Index.""" if self._group_as_index is None: - self._group_as_index = self.group.to_index() + if self.group.ndim == 1: + self._group_as_index = self.group.to_index() + else: + self._group_as_index = pd.Index(np.array(self.group).ravel()) return self._group_as_index - def factorize(self, group1d: T_Group) -> EncodedGroups: - self.group = group1d + def factorize(self, group: T_Group) -> EncodedGroups: + self.group = group index = self.group_as_index is_unique_and_monotonic = isinstance(self.group, _DummyGroup) or ( @@ -138,14 +184,17 @@ def _factorize_unique(self) -> EncodedGroups: raise ValueError( "Failed to group data. Are you grouping by a variable that is all NaN?" 
) - codes = self.group.copy(data=codes_) + codes = self.group.copy(data=codes_.reshape(self.group.shape)) unique_coord = Variable( dims=codes.name, data=unique_values, attrs=self.group.attrs ) full_index = pd.Index(unique_values) return EncodedGroups( - codes=codes, full_index=full_index, unique_coord=unique_coord + codes=codes, + full_index=full_index, + unique_coord=unique_coord, + coords=_coordinates_from_variable(unique_coord), ) def _factorize_dummy(self) -> EncodedGroups: @@ -156,20 +205,31 @@ def _factorize_dummy(self) -> EncodedGroups: group_indices: GroupIndices = tuple(slice(i, i + 1) for i in range(size)) size_range = np.arange(size) full_index: pd.Index + unique_coord: _DummyGroup | Variable if isinstance(self.group, _DummyGroup): codes = self.group.to_dataarray().copy(data=size_range) unique_coord = self.group full_index = pd.RangeIndex(self.group.size) + coords = Coordinates() else: codes = self.group.copy(data=size_range) unique_coord = self.group.variable.to_base_variable() - full_index = pd.Index(unique_coord.data) + full_index = self.group_as_index + if isinstance(full_index, pd.MultiIndex): + coords = Coordinates.from_pandas_multiindex( + full_index, dim=self.group.name + ) + else: + if TYPE_CHECKING: + assert isinstance(unique_coord, Variable) + coords = _coordinates_from_variable(unique_coord) return EncodedGroups( codes=codes, group_indices=group_indices, full_index=full_index, unique_coord=unique_coord, + coords=coords, ) @@ -231,7 +291,7 @@ def factorize(self, group: T_Group) -> EncodedGroups: data = np.asarray(group.data) # Cast _DummyGroup data to array binned, self.bins = pd.cut( # type: ignore [call-overload] - data, + data.ravel(), bins=self.bins, right=self.right, labels=self.labels, @@ -254,13 +314,18 @@ def factorize(self, group: T_Group) -> EncodedGroups: unique_values = full_index[uniques[uniques != -1]] codes = DataArray( - binned_codes, getattr(group, "coords", None), name=new_dim_name + binned_codes.reshape(group.shape), + getattr(group, "coords", None), + name=new_dim_name, ) unique_coord = Variable( dims=new_dim_name, data=unique_values, attrs=group.attrs ) return EncodedGroups( - codes=codes, full_index=full_index, unique_coord=unique_coord + codes=codes, + full_index=full_index, + unique_coord=unique_coord, + coords=_coordinates_from_variable(unique_coord), ) @@ -373,13 +438,14 @@ def factorize(self, group: T_Group) -> EncodedGroups: unique_coord = Variable( dims=group.name, data=first_items.index, attrs=group.attrs ) - codes = group.copy(data=codes_) + codes = group.copy(data=codes_.reshape(group.shape)) return EncodedGroups( codes=codes, group_indices=group_indices, full_index=full_index, unique_coord=unique_coord, + coords=_coordinates_from_variable(unique_coord), ) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 0caab6e8247..b4d3871c229 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -134,6 +134,7 @@ def _importorskip( has_pint, requires_pint = _importorskip("pint") has_numexpr, requires_numexpr = _importorskip("numexpr") has_flox, requires_flox = _importorskip("flox") +has_pandas_ge_2_1, __ = _importorskip("pandas", "2.1") has_pandas_ge_2_2, __ = _importorskip("pandas", "2.2") has_pandas_3, requires_pandas_3 = _importorskip("pandas", "3.0.0.dev0") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bbe48663c1f..c755924f583 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ from xarray.conventions import 
encode_dataset_coordinates from xarray.core import indexing from xarray.core.options import set_options +from xarray.core.utils import module_available from xarray.namedarray.pycompat import array_type from xarray.tests import ( assert_allclose, @@ -166,7 +167,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { - "_FillValue": 255, + "_FillValue": -1, "_Unsigned": "true", "dtype": "i1", "add_offset": dtype.type(10), @@ -242,6 +243,32 @@ def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset: return Dataset({"x": ("t", sb, attributes)}) +def create_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + encoding = { + "_FillValue": 255, + "_Unsigned": "false", + "dtype": "u1", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) + return Dataset({"x": ("t", x, {}, encoding)}) + + +def create_encoded_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + # These are values as written to the file: the _FillValue will + # be represented in the unsigned form. + attributes = { + "_FillValue": 255, + "_Unsigned": "false", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + # Create unsigned data corresponding to [-110, 1, 127, 255] signed + sb = np.asarray([146, 1, 127, 255], dtype="u1") + return Dataset({"x": ("t", sb, attributes)}) + + def create_boolean_data() -> Dataset: attributes = {"units": "-"} return Dataset({"x": ("t", [True, False, False, True], attributes)}) @@ -890,6 +917,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: create_signed_masked_scaled_data, create_encoded_signed_masked_scaled_data, ), + ( + create_unsigned_false_masked_scaled_data, + create_encoded_unsigned_false_masked_scaled_data, + ), (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) @@ -899,9 +930,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: pytest.skip("float32 will be treated as float64 in zarr") decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) + if decoded["x"].encoding["dtype"] == "u1" and not ( + self.engine == "netcdf4" + and self.file_format is None + or self.file_format == "NETCDF4" + ): + pytest.skip("uint8 data can't be written to non-NetCDF4 data") + with self.roundtrip(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].encoding["_FillValue"] + ) assert_allclose(decoded, actual, decode_bytes=False) with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: @@ -909,11 +952,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: # encode. Is that something we want to test for? 
for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + encoded.variables[k].attrs["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) # make sure roundtrip encoding didn't change the @@ -925,11 +978,33 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) - @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255), -1, 255]) - def test_roundtrip_unsigned(self, fillvalue): + @pytest.mark.parametrize( + ("fill_value", "exp_fill_warning"), + [ + (np.int8(-1), False), + (np.uint8(255), True), + (-1, False), + (255, True), + ], + ) + def test_roundtrip_unsigned(self, fill_value, exp_fill_warning): + @contextlib.contextmanager + def _roundtrip_with_warnings(*args, **kwargs): + is_np2 = module_available("numpy", minversion="2.0.0.dev0") + if exp_fill_warning and is_np2: + warn_checker: contextlib.AbstractContextManager = pytest.warns( + SerializationWarning, + match="_FillValue attribute can't be represented", + ) + else: + warn_checker = contextlib.nullcontext() + with warn_checker: + with self.roundtrip(*args, **kwargs) as actual: + yield actual + # regression/numpy2 test for encoding = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", "dtype": "i1", } @@ -937,21 +1012,32 @@ def test_roundtrip_unsigned(self, fillvalue): decoded = Dataset({"x": ("t", x, {}, encoding)}) attributes = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1") encoded = Dataset({"x": ("t", sb, attributes)}) + unsigned_dtype = np.dtype(f"u{sb.dtype.itemsize}") - with self.roundtrip(decoded) as actual: + with _roundtrip_with_warnings(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = decoded.variables[k].encoding["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].encoding["_FillValue"] assert_allclose(decoded, actual, decode_bytes=False) - with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: + with _roundtrip_with_warnings( + decoded, open_kwargs=dict(decode_cf=False) + ) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = encoded.variables[k].attrs["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].attrs["_FillValue"] assert_allclose(encoded, actual, decode_bytes=False) @staticmethod diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 6d81d6f5dc8..acb32504948 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -129,7 +129,7 @@ def test_decode_unsigned_from_signed(bits) -> None: encoded = xr.Variable( ("x",), 
original_values.astype(signed_dtype), attrs={"_Unsigned": "true"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == unsigned_dtype assert decoded.values == original_values @@ -143,7 +143,7 @@ def test_decode_signed_from_unsigned(bits) -> None: encoded = xr.Variable( ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == signed_dtype assert decoded.values == original_values diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index fb3d487f2ef..f2e712e334c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6704,18 +6704,80 @@ def test_polyfit_warnings(self) -> None: ds.var1.polyfit("dim2", 10, full=True) assert len(ws) == 1 - def test_pad(self) -> None: - ds = create_test_data(seed=1) - padded = ds.pad(dim2=(1, 1), constant_values=42) - - assert padded["dim2"].shape == (11,) - assert padded["var1"].shape == (8, 11) - assert padded["var2"].shape == (8, 11) - assert padded["var3"].shape == (10, 8) - assert dict(padded.sizes) == {"dim1": 8, "dim2": 11, "dim3": 10, "time": 20} + @staticmethod + def _test_data_var_interior( + original_data_var, padded_data_var, padded_dim_name, expected_pad_values + ): + np.testing.assert_equal( + np.unique(padded_data_var.isel({padded_dim_name: [0, -1]})), + expected_pad_values, + ) + np.testing.assert_array_equal( + padded_data_var.isel({padded_dim_name: slice(1, -1)}), original_data_var + ) - np.testing.assert_equal(padded["var1"].isel(dim2=[0, -1]).data, 42) - np.testing.assert_equal(padded["dim2"][[0, -1]].data, np.nan) + @pytest.mark.parametrize("padded_dim_name", ["dim1", "dim2", "dim3", "time"]) + @pytest.mark.parametrize( + ["constant_values"], + [ + pytest.param(None, id="default"), + pytest.param(42, id="scalar"), + pytest.param((42, 43), id="tuple"), + pytest.param({"dim1": 42, "dim2": 43}, id="per dim scalar"), + pytest.param({"dim1": (42, 43), "dim2": (43, 44)}, id="per dim tuple"), + pytest.param({"var1": 42, "var2": (42, 43)}, id="per var"), + pytest.param({"var1": 42, "dim1": (42, 43)}, id="mixed"), + ], + ) + def test_pad(self, padded_dim_name, constant_values) -> None: + ds = create_test_data(seed=1) + padded = ds.pad({padded_dim_name: (1, 1)}, constant_values=constant_values) + + # test padded dim values and size + for ds_dim_name, ds_dim in ds.sizes.items(): + if ds_dim_name == padded_dim_name: + np.testing.assert_equal(padded.sizes[ds_dim_name], ds_dim + 2) + if ds_dim_name in padded.coords: + assert padded[ds_dim_name][[0, -1]].isnull().all() + else: + np.testing.assert_equal(padded.sizes[ds_dim_name], ds_dim) + + # check if coord "numbers" with dimention dim3 is paded correctly + if padded_dim_name == "dim3": + assert padded["numbers"][[0, -1]].isnull().all() + # twarning: passes but dtype changes from int to float + np.testing.assert_array_equal(padded["numbers"][1:-1], ds["numbers"]) + + # test if data_vars are paded with correct values + for data_var_name, data_var in padded.data_vars.items(): + if padded_dim_name in data_var.dims: + if utils.is_dict_like(constant_values): + if ( + expected := constant_values.get(data_var_name, None) + ) is not None: + self._test_data_var_interior( + ds[data_var_name], data_var, padded_dim_name, expected + ) + elif ( + expected := constant_values.get(padded_dim_name, None) + ) is not None: + 
self._test_data_var_interior( + ds[data_var_name], data_var, padded_dim_name, expected + ) + else: + self._test_data_var_interior( + ds[data_var_name], data_var, padded_dim_name, 0 + ) + elif constant_values: + self._test_data_var_interior( + ds[data_var_name], data_var, padded_dim_name, constant_values + ) + else: + self._test_data_var_interior( + ds[data_var_name], data_var, padded_dim_name, np.nan + ) + else: + assert_array_equal(data_var, ds[data_var_name]) @pytest.mark.parametrize( ["keep_attrs", "attrs", "expected"], diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 6c9254966d9..7dbb0d5e59c 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -22,6 +22,7 @@ create_test_data, has_cftime, has_flox, + has_pandas_ge_2_1, requires_cftime, requires_dask, requires_flox, @@ -118,6 +119,13 @@ def test_multi_index_groupby_sum() -> None: actual = ds.stack(space=["x", "y"]).groupby("space").sum("z").unstack("space") assert_equal(expected, actual) + if not has_pandas_ge_2_1: + # the next line triggers a mysterious multiindex error on pandas 2.0 + return + + actual = ds.stack(space=["x", "y"]).groupby("space").sum(...).unstack("space") + assert_equal(expected, actual) + def test_groupby_da_datetime() -> None: # test groupby with a DataArray of dtype datetime for GH1132 @@ -806,6 +814,7 @@ def test_groupby_dataset_errors() -> None: data.groupby(data.coords["dim1"].to_index()) # type: ignore[arg-type] +@pytest.mark.parametrize("use_flox", [True, False]) @pytest.mark.parametrize( "by_func", [ @@ -813,7 +822,10 @@ def test_groupby_dataset_errors() -> None: pytest.param(lambda x: {x: UniqueGrouper()}, id="group-by-unique-grouper"), ], ) -def test_groupby_dataset_reduce_ellipsis(by_func) -> None: +@pytest.mark.parametrize("letters_as_coord", [True, False]) +def test_groupby_dataset_reduce_ellipsis( + by_func, use_flox: bool, letters_as_coord: bool +) -> None: data = Dataset( { "xy": (["x", "y"], np.random.randn(3, 4)), @@ -823,13 +835,18 @@ def test_groupby_dataset_reduce_ellipsis(by_func) -> None: } ) + if letters_as_coord: + data = data.set_coords("letters") + expected = data.mean("y") expected["yonly"] = expected["yonly"].variable.set_dims({"x": 3}) gb = data.groupby(by_func("x")) - actual = gb.mean(...) + with xr.set_options(use_flox=use_flox): + actual = gb.mean(...) assert_allclose(expected, actual) - actual = gb.mean("y") + with xr.set_options(use_flox=use_flox): + actual = gb.mean("y") assert_allclose(expected, actual) letters = data["letters"] @@ -841,7 +858,8 @@ def test_groupby_dataset_reduce_ellipsis(by_func) -> None: } ) gb = data.groupby(by_func("letters")) - actual = gb.mean(...) + with xr.set_options(use_flox=use_flox): + actual = gb.mean(...) 
assert_allclose(expected, actual) @@ -1729,7 +1747,7 @@ def test_groupby_fastpath_for_monotonic(self, use_flox: bool) -> None: rev = array_rev.groupby("idx", squeeze=False) for gb in [fwd, rev]: - assert all([isinstance(elem, slice) for elem in gb._group_indices]) + assert all([isinstance(elem, slice) for elem in gb.encoded.group_indices]) with xr.set_options(use_flox=use_flox): assert_identical(fwd.sum(), array) @@ -2561,3 +2579,29 @@ def factorize(self, group) -> EncodedGroups: obj.groupby("time.year", time=YearGrouper()) with pytest.raises(ValueError): obj.groupby() + + +@pytest.mark.parametrize("use_flox", [True, False]) +def test_weather_data_resample(use_flox): + # from the docs + times = pd.date_range("2000-01-01", "2001-12-31", name="time") + annual_cycle = np.sin(2 * np.pi * (times.dayofyear.values / 365.25 - 0.28)) + + base = 10 + 15 * annual_cycle.reshape(-1, 1) + tmin_values = base + 3 * np.random.randn(annual_cycle.size, 3) + tmax_values = base + 10 + 3 * np.random.randn(annual_cycle.size, 3) + + ds = xr.Dataset( + { + "tmin": (("time", "location"), tmin_values), + "tmax": (("time", "location"), tmax_values), + }, + { + "time": ("time", times, {"time_key": "time_values"}), + "location": ("location", ["IA", "IN", "IL"], {"loc_key": "loc_value"}), + }, + ) + + with xr.set_options(use_flox=use_flox): + actual = ds.resample(time="1MS").mean() + assert "location" in actual._indexes
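
A minimal decode sketch of the ``_Unsigned``/``_FillValue`` handling that this patch folds into ``CFMaskCoder`` (previously split out into ``UnsignedIntegerCoder``). The variable name, dimension name and byte values below are illustrative only, chosen to mirror the updated roundtrip tests:

    import numpy as np
    import xarray as xr
    from xarray.coding import variables

    # Signed on-disk bytes that CF metadata declares as unsigned; the fill
    # value is stored in its signed (on-disk) representation.
    encoded = xr.Variable(
        ("t",),
        np.array([0, 1, 127, -128, -1], dtype="i1"),
        attrs={"_Unsigned": "true", "_FillValue": np.int8(-1)},
    )

    decoded = variables.CFMaskCoder().decode(encoded, name="x")
    # The -1 fill byte is reinterpreted as 255 and masked, and the data is
    # promoted to float so the fill can become NaN: [0., 1., 127., 128., nan]
    print(decoded.values)

Because both steps now happen in a single coder, the raw on-disk ``_FillValue`` is retained in ``decoded.encoding`` even when ``_Unsigned`` is applied.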
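
The per-variable ``constant_values`` newly accepted by ``Dataset.pad`` can be sketched as follows; the dataset and the variable/dimension names are made up for illustration:

    import numpy as np
    import xarray as xr

    # Toy dataset with two data variables sharing dimension "x".
    ds = xr.Dataset(
        {
            "temperature": ("x", np.array([1.0, 2.0, 3.0])),
            "pressure": ("x", np.array([10.0, 20.0, 30.0])),
        }
    )

    # "temperature" gets a per-dimension (before, after) pair, while
    # "pressure" uses a single scalar on both edges.
    padded = ds.pad(
        x=(1, 1),
        mode="constant",
        constant_values={"temperature": {"x": (0.0, -1.0)}, "pressure": 99.0},
    )

    # temperature -> [ 0.  1.  2.  3. -1.], pressure -> [99. 10. 20. 30. 99.]
    print(padded["temperature"].values, padded["pressure"].values)

Variable names and dimension names may be mixed in the same mapping; when ``constant_values`` is a mapping, data variables not addressed by name or by one of their dimensions currently fall back to ``0`` (see the TODO referenced in ``Dataset.pad``).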
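
The groupby refactor routes all factorization results through ``EncodedGroups``, whose new ``__init__`` derives ``group_indices``, ``unique_coord`` and ``coords`` when they are not supplied. A small sketch with made-up group codes and labels (the exact container types of ``group_indices`` may differ):

    import pandas as pd
    import xarray as xr
    from xarray.groupers import EncodedGroups

    # Hypothetical integer codes assigning each "time" position to one of
    # three groups.
    codes = xr.DataArray([0, 1, 0, 2, 1], dims="time", name="season")
    encoded = EncodedGroups(
        codes=codes, full_index=pd.Index(["DJF", "MAM", "JJA"])
    )

    # Derived in __init__ because they were not passed explicitly:
    print(encoded.group_indices)        # e.g. ([0, 2], [1, 4], [3])
    print(encoded.unique_coord.values)  # ['DJF' 'MAM' 'JJA']
    print(encoded.coords)               # Coordinates built from the labels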