From e70138b61033081e3bfab3aaaec5997716cd7109 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Wed, 13 Nov 2019 00:53:26 +0000 Subject: [PATCH 01/24] Recursive tokenization (#3515) * recursive tokenize * black * What's New * Also test Dataset * Also test IndexVariable * Cleanup * tokenize sparse objects --- doc/whats-new.rst | 2 +- xarray/core/dataarray.py | 4 +++- xarray/core/dataset.py | 6 +++++- xarray/core/variable.py | 8 ++++++-- xarray/tests/test_dask.py | 26 ++++++++++++++++++++++++++ xarray/tests/test_sparse.py | 4 ++++ 6 files changed, 45 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 96f0ba9a4a6..620617c127a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -73,7 +73,7 @@ New Features for xarray objects. Note that xarray objects with a dask.array backend already used deterministic hashing in previous releases; this change implements it when whole xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is - invoked. (:issue:`3378`, :pull:`3446`) + invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`) By `Deepak Cherian `_ and `Guido Imperiale `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 5e164f420c8..a192fe08cee 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -755,7 +755,9 @@ def reset_coords( return dataset def __dask_tokenize__(self): - return (type(self), self._variable, self._coords, self._name) + from dask.base import normalize_token + + return normalize_token((type(self), self._variable, self._coords, self._name)) def __dask_graph__(self): return self._to_temp_dataset().__dask_graph__() diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index dc5a315e72a..fe8abdc4b95 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -652,7 +652,11 @@ def load(self, **kwargs) -> "Dataset": return self def __dask_tokenize__(self): - return (type(self), self._variables, self._coord_names, self._attrs) + from dask.base import normalize_token + + return normalize_token( + (type(self), self._variables, self._coord_names, self._attrs) + ) def __dask_graph__(self): graphs = {k: v.__dask_graph__() for k, v in self.variables.items()} diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 916df75b3e0..f842a4a9428 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -393,7 +393,9 @@ def compute(self, **kwargs): def __dask_tokenize__(self): # Use v.data, instead of v._data, in order to cope with the wrappers # around NetCDF and the like - return type(self), self._dims, self.data, self._attrs + from dask.base import normalize_token + + return normalize_token((type(self), self._dims, self.data, self._attrs)) def __dask_graph__(self): if isinstance(self._data, dask_array_type): @@ -1973,8 +1975,10 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): self._data = PandasIndexAdapter(self._data) def __dask_tokenize__(self): + from dask.base import normalize_token + # Don't waste time converting pd.Index to np.ndarray - return (type(self), self._dims, self._data.array, self._attrs) + return normalize_token((type(self), self._dims, self._data.array, self._attrs)) def load(self): # data is already loaded into memory for IndexVariable diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index fa8ae9991d7..43b788153bc 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1283,6 +1283,32 @@ def test_token_identical(obj, transform): ) +def test_recursive_token(): + """Test that 
tokenization is invoked recursively, and doesn't just rely on the + output of str() + """ + a = np.ones(10000) + b = np.ones(10000) + b[5000] = 2 + assert str(a) == str(b) + assert dask.base.tokenize(a) != dask.base.tokenize(b) + + # Test DataArray and Variable + da_a = DataArray(a) + da_b = DataArray(b) + assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b) + + # Test Dataset + ds_a = da_a.to_dataset(name="x") + ds_b = da_b.to_dataset(name="x") + assert dask.base.tokenize(ds_a) != dask.base.tokenize(ds_b) + + # Test IndexVariable + da_a = DataArray(a, dims=["x"], coords={"x": a}) + da_b = DataArray(a, dims=["x"], coords={"x": b}) + assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b) + + @requires_scipy_or_netCDF4 def test_normalize_token_with_backend(map_ds): with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file: diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index a31da162487..a02fef2faeb 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -856,6 +856,10 @@ def test_dask_token(): import dask s = sparse.COO.from_numpy(np.array([0, 0, 1, 2])) + + # https://github.com/pydata/sparse/issues/300 + s.__dask_tokenize__ = lambda: dask.base.normalize_token(s.__dict__) + a = DataArray(s) t1 = dask.base.tokenize(a) t2 = dask.base.tokenize(a) From 94525bbaf417476dbe9a70b98801ae04aceaebf3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 13 Nov 2019 15:48:45 +0000 Subject: [PATCH 02/24] Deprecate allow_lazy (#3435) * Deprecate allow_lazy * add whats-new * test that reductions are lazy * minor whats-new fix. * fix merge wahts=new * fix bad merge. * remove tests that only work with nep-18 * Update doc/whats-new.rst Co-Authored-By: Mathias Hauser * Update xarray/core/variable.py Co-Authored-By: Mathias Hauser * fix whats-new * Fix test that assumed NEP-18 * fix tests. --- doc/whats-new.rst | 3 +++ xarray/core/common.py | 17 ++++------------- xarray/core/dataset.py | 2 +- xarray/core/groupby.py | 4 +--- xarray/core/variable.py | 13 ++++++++++++- xarray/tests/test_dask.py | 18 ++++++++++++++++-- xarray/tests/test_variable.py | 4 ++++ 7 files changed, 41 insertions(+), 20 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 620617c127a..212e465b368 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -88,6 +88,9 @@ Bug fixes By `Deepak Cherian `_. - Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4. By `Anderson Banihirwe `_. +- Rolling reduction operations no longer compute dask arrays by default. (:issue:`3161`). + In addition, the ``allow_lazy`` kwarg to ``reduce`` is deprecated. + By `Deepak Cherian `_. - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple dimensions. (:issue:`3402`). 
By `Deepak Cherian `_ diff --git a/xarray/core/common.py b/xarray/core/common.py index d372115ea57..2afe4b4c3a7 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -43,14 +43,12 @@ def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool if include_skipna: def wrapped_func(self, dim=None, axis=None, skipna=None, **kwargs): - return self.reduce( - func, dim, axis, skipna=skipna, allow_lazy=True, **kwargs - ) + return self.reduce(func, dim, axis, skipna=skipna, **kwargs) else: def wrapped_func(self, dim=None, axis=None, **kwargs): # type: ignore - return self.reduce(func, dim, axis, allow_lazy=True, **kwargs) + return self.reduce(func, dim, axis, **kwargs) return wrapped_func @@ -83,20 +81,13 @@ def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool def wrapped_func(self, dim=None, skipna=None, **kwargs): return self.reduce( - func, - dim, - skipna=skipna, - numeric_only=numeric_only, - allow_lazy=True, - **kwargs, + func, dim, skipna=skipna, numeric_only=numeric_only, **kwargs ) else: def wrapped_func(self, dim=None, **kwargs): # type: ignore - return self.reduce( - func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs - ) + return self.reduce(func, dim, numeric_only=numeric_only, **kwargs) return wrapped_func diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fe8abdc4b95..15a7209ab24 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4031,7 +4031,7 @@ def reduce( keep_attrs: bool = None, keepdims: bool = False, numeric_only: bool = False, - allow_lazy: bool = False, + allow_lazy: bool = None, **kwargs: Any, ) -> "Dataset": """Reduce this dataset by applying `func` along some dimension(s). diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 8ae65d9b9df..c73ee3cf7c5 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -585,9 +585,7 @@ def _first_or_last(self, op, skipna, keep_attrs): return self._obj if keep_attrs is None: keep_attrs = _get_keep_attrs(default=True) - return self.reduce( - op, self._group_dim, skipna=skipna, keep_attrs=keep_attrs, allow_lazy=True - ) + return self.reduce(op, self._group_dim, skipna=skipna, keep_attrs=keep_attrs) def first(self, skipna=None, keep_attrs=None): """Return the first element of each group along the group dimension diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f842a4a9428..cf97c997017 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,5 +1,6 @@ import functools import itertools +import warnings from collections import defaultdict from datetime import timedelta from distutils.version import LooseVersion @@ -1427,7 +1428,7 @@ def reduce( axis=None, keep_attrs=None, keepdims=False, - allow_lazy=False, + allow_lazy=None, **kwargs, ): """Reduce this array by applying `func` along some dimension(s). @@ -1468,7 +1469,17 @@ def reduce( if dim is not None: axis = self.get_axis_num(dim) + + if allow_lazy is not None: + warnings.warn( + "allow_lazy is deprecated and will be removed in version 0.16.0. 
It is now True by default.", + DeprecationWarning, + ) + else: + allow_lazy = True + input_data = self.data if allow_lazy else self.values + if axis is not None: data = func(input_data, axis=axis, **kwargs) else: diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 43b788153bc..4c1f317342f 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -12,6 +12,7 @@ import xarray as xr import xarray.ufuncs as xu from xarray import DataArray, Dataset, Variable +from xarray.core import duck_array_ops from xarray.testing import assert_chunks_equal from xarray.tests import mock @@ -217,6 +218,8 @@ def test_reduce(self): self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x")) with raises_regex(NotImplementedError, "dask"): v.median() + with raise_if_dask_computes(): + v.reduce(duck_array_ops.mean) def test_missing_values(self): values = np.array([0, 1, np.nan, 3]) @@ -488,7 +491,17 @@ def test_groupby(self): v = self.lazy_array expected = u.groupby("x").mean(...) - actual = v.groupby("x").mean(...) + with raise_if_dask_computes(): + actual = v.groupby("x").mean(...) + self.assertLazyAndAllClose(expected, actual) + + def test_rolling(self): + u = self.eager_array + v = self.lazy_array + + expected = u.rolling(x=2).mean() + with raise_if_dask_computes(): + actual = v.rolling(x=2).mean() self.assertLazyAndAllClose(expected, actual) def test_groupby_first(self): @@ -500,7 +513,8 @@ def test_groupby_first(self): with raises_regex(NotImplementedError, "dask"): v.groupby("ab").first() expected = u.groupby("ab").first() - actual = v.groupby("ab").first(skipna=False) + with raise_if_dask_computes(): + actual = v.groupby("ab").first(skipna=False) self.assertLazyAndAllClose(expected, actual) def test_reindex(self): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 528027ed149..d394919dbdd 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1477,6 +1477,10 @@ def test_reduce(self): with raises_regex(ValueError, "cannot supply both"): v.mean(dim="x", axis=0) + with pytest.warns(DeprecationWarning, match="allow_lazy is deprecated"): + v.mean(dim="x", allow_lazy=True) + with pytest.warns(DeprecationWarning, match="allow_lazy is deprecated"): + v.mean(dim="x", allow_lazy=False) def test_quantile(self): v = Variable(["x", "y"], self.d) From 7241aa12ae168f7af6efcf13f8012158a1331cb3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 13 Nov 2019 15:53:34 +0000 Subject: [PATCH 03/24] warn if dim is passed to rolling operations. (#3513) * warn if dim is passed to rolling operations. * Update doc/whats-new.rst Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Update xarray/core/rolling.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 3 +++ xarray/core/rolling.py | 9 +++++++++ xarray/tests/test_dataarray.py | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 212e465b368..f042f846c39 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -220,6 +220,9 @@ Bug fixes By `Deepak Cherian `_. - Fix error in concatenating unlabeled dimensions (:pull:`3362`). By `Deepak Cherian `_. +- Warn if the ``dim`` kwarg is passed to rolling operations. This is redundant since a dimension is + specified when the :py:class:`DatasetRolling` or :py:class:`DataArrayRolling` object is created. + (:pull:`3362`). By `Deepak Cherian `_. 
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index f4e571a8efe..a1864332f4d 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -1,4 +1,5 @@ import functools +import warnings from typing import Callable import numpy as np @@ -351,6 +352,14 @@ def _bottleneck_reduce(self, func, **kwargs): def _numpy_or_bottleneck_reduce( self, array_agg_func, bottleneck_move_func, **kwargs ): + if "dim" in kwargs: + warnings.warn( + f"Reductions will be applied along the rolling dimension '{self.dim}'. Passing the 'dim' kwarg to reduction operations has no effect and will raise an error in xarray 0.16.0.", + DeprecationWarning, + stacklevel=3, + ) + del kwargs["dim"] + if bottleneck_move_func is not None and not isinstance( self.obj.data, dask_array_type ): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 42fae2c9dd4..7c6dc1825a1 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4188,6 +4188,9 @@ def test_rolling_wrapped_bottleneck(da, name, center, min_periods): ) assert_array_equal(actual.values, expected) + with pytest.warns(DeprecationWarning, match="Reductions will be applied"): + getattr(rolling_obj, name)(dim="time") + # Test center rolling_obj = da.rolling(time=7, center=center) actual = getattr(rolling_obj, name)()["time"] @@ -4203,6 +4206,9 @@ def test_rolling_wrapped_dask(da_dask, name, center, min_periods, window): # dask version rolling_obj = da_dask.rolling(time=window, min_periods=min_periods, center=center) actual = getattr(rolling_obj, name)().load() + if name != "count": + with pytest.warns(DeprecationWarning, match="Reductions will be applied"): + getattr(rolling_obj, name)(dim="time") # numpy version rolling_obj = da_dask.load().rolling( time=window, min_periods=min_periods, center=center From 40588dc38ddc2d573e3dc8c63b2e6533eb978656 Mon Sep 17 00:00:00 2001 From: Akihiro Matsukawa Date: Wed, 13 Nov 2019 10:55:32 -0500 Subject: [PATCH 04/24] Allow appending datetime & boolean variables to zarr stores (#3504) * Allow appending datetime and boolean data variables to zarr stores. * Run black and flake8 * Update error message --- doc/whats-new.rst | 2 ++ xarray/backends/api.py | 7 +++++-- xarray/tests/test_dataset.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f042f846c39..ea3b012cc98 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -94,6 +94,8 @@ Bug fixes - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple dimensions. (:issue:`3402`). By `Deepak Cherian `_ +- Allow appending datetime and bool data variables to zarr stores. + (:issue:`3480`). By `Akihiro Matsukawa `_. 
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d23594fc675..945b3937c43 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1234,6 +1234,8 @@ def _validate_datatypes_for_zarr_append(dataset): def check_dtype(var): if ( not np.issubdtype(var.dtype, np.number) + and not np.issubdtype(var.dtype, np.datetime64) + and not np.issubdtype(var.dtype, np.bool) and not coding.strings.is_unicode_dtype(var.dtype) and not var.dtype == object ): @@ -1241,8 +1243,9 @@ def check_dtype(var): raise ValueError( "Invalid dtype for data variable: {} " "dtype must be a subtype of number, " - "a fixed sized string, a fixed size " - "unicode string or an object".format(var) + "datetime, bool, a fixed sized string, " + "a fixed size unicode string or an " + "object".format(var) ) for k in dataset.data_vars.values(): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d001c43da94..67d3b3198dc 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -90,6 +90,14 @@ def create_append_test_data(seed=None): string_var = np.array(["ae", "bc", "df"], dtype=object) string_var_to_append = np.array(["asdf", "asdfg"], dtype=object) unicode_var = ["áó", "áó", "áó"] + datetime_var = np.array( + ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]" + ) + datetime_var_to_append = np.array( + ["2019-01-04", "2019-01-05"], dtype="datetime64[s]" + ) + bool_var = np.array([True, False, True], dtype=np.bool) + bool_var_to_append = np.array([False, True], dtype=np.bool) ds = xr.Dataset( data_vars={ @@ -102,6 +110,8 @@ def create_append_test_data(seed=None): "unicode_var": xr.DataArray( unicode_var, coords=[time1], dims=["time"] ).astype(np.unicode_), + "datetime_var": xr.DataArray(datetime_var, coords=[time1], dims=["time"]), + "bool_var": xr.DataArray(bool_var, coords=[time1], dims=["time"]), } ) @@ -118,6 +128,10 @@ def create_append_test_data(seed=None): "unicode_var": xr.DataArray( unicode_var[:nt2], coords=[time2], dims=["time"] ).astype(np.unicode_), + "datetime_var": xr.DataArray( + datetime_var_to_append, coords=[time2], dims=["time"] + ), + "bool_var": xr.DataArray(bool_var_to_append, coords=[time2], dims=["time"]), } ) From 810345c4564a2bc15bf1b4c7ba4c4840238f1e82 Mon Sep 17 00:00:00 2001 From: Gina Date: Wed, 13 Nov 2019 14:18:14 -0600 Subject: [PATCH 05/24] FUNDING.yml (#3523) add NumFOCUS github sponsors button (recurring donations only) This feature launched today at GitHub Universe! Also add the custom link to point to the donation form for xarray. 
cc @shoyer --- .github/FUNDING.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000000..30c1e18f33c --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: numfocus +custom: http://numfocus.org/donate-to-xarray From eece07932d5498a8abef6a8fbd30d00066931b18 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Wed, 13 Nov 2019 18:22:50 -0700 Subject: [PATCH 06/24] Harmonize `FillValue` and `missing_value` during encoding and decoding steps (#3502) * Replace `equivalent()` with `allclose_or_equiv()` * Ensure _FillValue & missing_value are cast to same dtype as data's * Use Numpy scalar during type casting * Update ValueError message * Formatting only * Update whats-new.rst --- doc/whats-new.rst | 2 ++ xarray/coding/variables.py | 14 ++++++++++---- xarray/tests/test_coding.py | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ea3b012cc98..f840557ab5d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,8 @@ New Features Bug fixes ~~~~~~~~~ +- Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) + By `Anderson Banihirwe `_. - Fix regression introduced in v0.14.0 that would cause a crash if dask is installed but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle `_ - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`). diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 5f9c8932b6b..2b5f87ab0cd 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -8,7 +8,6 @@ from ..core import dtypes, duck_array_ops, indexing from ..core.pycompat import dask_array_type -from ..core.utils import equivalent from ..core.variable import Variable @@ -152,18 +151,25 @@ def encode(self, variable, name=None): fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if fv is not None and mv is not None and not equivalent(fv, mv): + if ( + fv is not None + and mv is not None + and not duck_array_ops.allclose_or_equiv(fv, mv) + ): raise ValueError( - "Variable {!r} has multiple fill values {}. " - "Cannot encode data. ".format(name, [fv, mv]) + f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." 
) if fv is not None: + # Ensure _FillValue is cast to same dtype as data's + encoding["_FillValue"] = data.dtype.type(fv) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) if mv is not None: + # Ensure missing_value is cast to same dtype as data's + encoding["missing_value"] = data.dtype.type(mv) fill_value = pop_to(encoding, attrs, "missing_value", name=name) if not pd.isnull(fill_value) and fv is None: data = duck_array_ops.fillna(data, fill_value) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 6cd584daa96..3e0474e7b60 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -20,6 +20,23 @@ def test_CFMaskCoder_decode(): assert_identical(expected, encoded) +def test_CFMaskCoder_encode_missing_fill_values_conflict(): + original = xr.Variable( + ("x",), + [0.0, -1.0, 1.0], + encoding={"_FillValue": np.float32(1e20), "missing_value": np.float64(1e20)}, + ) + coder = variables.CFMaskCoder() + encoded = coder.encode(original) + + assert encoded.dtype == encoded.attrs["missing_value"].dtype + assert encoded.dtype == encoded.attrs["_FillValue"].dtype + + with pytest.warns(variables.SerializationWarning): + roundtripped = coder.decode(coder.encode(original)) + assert_identical(roundtripped, original) + + def test_CFMaskCoder_missing_value(): expected = xr.DataArray( np.array([[26915, 27755, -9999, 27705], [25595, -9999, 28315, -9999]]), From 4358762d7ccf0d81dfbbc37d9c0665d53fe9c426 Mon Sep 17 00:00:00 2001 From: keewis Date: Thu, 14 Nov 2019 02:24:07 +0100 Subject: [PATCH 07/24] Tests for module-level functions with units (#3493) * add tests for replication functions * add tests for `xarray.dot` * add tests for apply_ufunc * explicitly set the test ids to repr * add tests for align * cover a bit more of align * add tests for broadcast * black changed how tuple unpacking should look like * correct the xfail message for full_like tests * add tests for where * add tests for concat * add tests for combine_by_coords * fix a bug in convert_units * convert the align results to the same units * rename the combine_by_coords test * convert the units for expected in combine_by_coords * add tests for combine_nested * add tests for merge with datasets * only use three datasets for merging * add tests for merge with dataarrays * update whats-new.rst --- doc/whats-new.rst | 3 +- xarray/tests/test_units.py | 871 ++++++++++++++++++++++++++++++++++++- 2 files changed, 865 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f840557ab5d..a7687368884 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -118,7 +118,8 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Added integration tests against `pint `_. - (:pull:`3238`, :pull:`3447`, :pull:`3508`) by `Justus Magin `_. + (:pull:`3238`, :pull:`3447`, :pull:`3493`, :pull:`3508`) + by `Justus Magin `_. .. 
note:: diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index fd9e9b039ac..509a50d23ff 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -222,7 +222,9 @@ def convert_units(obj, to): if name != obj.name } - new_obj = xr.DataArray(name=name, data=data, coords=coords, attrs=obj.attrs) + new_obj = xr.DataArray( + name=name, data=data, coords=coords, attrs=obj.attrs, dims=obj.dims + ) elif isinstance(obj, unit_registry.Quantity): units = to.get(None) new_obj = obj.to(units) if units is not None else obj @@ -307,19 +309,689 @@ def __repr__(self): class function: - def __init__(self, name): - self.name = name - self.func = getattr(np, name) + def __init__(self, name_or_function, *args, **kwargs): + if callable(name_or_function): + self.name = name_or_function.__name__ + self.func = name_or_function + else: + self.name = name_or_function + self.func = getattr(np, name_or_function) + if self.func is None: + raise AttributeError( + f"module 'numpy' has no attribute named '{self.name}'" + ) + + self.args = args + self.kwargs = kwargs def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) + all_args = list(self.args) + list(args) + all_kwargs = {**self.kwargs, **kwargs} + + return self.func(*all_args, **all_kwargs) def __repr__(self): return f"function_{self.name}" +def test_apply_ufunc_dataarray(dtype): + func = function( + xr.apply_ufunc, np.mean, input_core_dims=[["x"]], kwargs={"axis": -1} + ) + + array = np.linspace(0, 10, 20).astype(dtype) * unit_registry.m + x = np.arange(20) * unit_registry.s + data_array = xr.DataArray(data=array, dims="x", coords={"x": x}) + + expected = attach_units(func(strip_units(data_array)), extract_units(data_array)) + result = func(data_array) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail( + reason="pint does not implement `np.result_type` and align strips units" +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +@pytest.mark.parametrize("fill_value", (np.float64(10), np.float64(np.nan))) +def test_align_dataarray(fill_value, variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit + array2 = np.linspace(0, 8, 2 * 5).reshape(2, 5).astype(dtype) * data_unit + x = np.arange(2) * original_unit + x_a1 = np.array([10, 5]) * original_unit + x_a2 = np.array([10, 5]) * coord_unit + + y1 = np.arange(5) * original_unit + y2 = np.arange(2, 7) * dim_unit + + data_array1 = xr.DataArray( + data=array1, coords={"x": x, "x_a": ("x", x_a1), "y": y1}, dims=("x", "y") + ) + data_array2 = xr.DataArray( + data=array2, coords={"x": x, "x_a": ("x", x_a2), "y": y2}, dims=("x", "y") + ) + + fill_value = fill_value * data_unit + func = function(xr.align, join="outer", 
fill_value=fill_value) + if error is not None: + with pytest.raises(error): + func(data_array1, data_array2) + + return + + stripped_kwargs = { + key: strip_units( + convert_units(value, {None: original_unit}) + if isinstance(value, unit_registry.Quantity) + else value + ) + for key, value in func.kwargs.items() + } + units = extract_units(data_array1) + # FIXME: should the expected_b have the same units as data_array1 + # or data_array2? + expected_a, expected_b = tuple( + attach_units(elem, units) + for elem in func( + strip_units(data_array1), + strip_units(convert_units(data_array2, units)), + **stripped_kwargs, + ) + ) + result_a, result_b = func(data_array1, data_array2) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +@pytest.mark.xfail( + reason="pint does not implement `np.result_type` and align strips units" +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +@pytest.mark.parametrize("fill_value", (np.float64(10), np.float64(np.nan))) +def test_align_dataset(fill_value, unit, variant, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit + array2 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * data_unit + + x = np.arange(2) * original_unit + x_a1 = np.array([10, 5]) * original_unit + x_a2 = np.array([10, 5]) * coord_unit + + y1 = np.arange(5) * original_unit + y2 = np.arange(2, 7) * dim_unit + + ds1 = xr.Dataset( + data_vars={"a": (("x", "y"), array1)}, + coords={"x": x, "x_a": ("x", x_a1), "y": y1}, + ) + ds2 = xr.Dataset( + data_vars={"a": (("x", "y"), array2)}, + coords={"x": x, "x_a": ("x", x_a2), "y": y2}, + ) + + fill_value = fill_value * data_unit + func = function(xr.align, join="outer", fill_value=fill_value) + if error is not None: + with pytest.raises(error): + func(ds1, ds2) + + return + + stripped_kwargs = { + key: strip_units( + convert_units(value, {None: original_unit}) + if isinstance(value, unit_registry.Quantity) + else value + ) + for key, value in func.kwargs.items() + } + units = extract_units(ds1) + # FIXME: should the expected_b have the same units as ds1 or ds2? 
+ expected_a, expected_b = tuple( + attach_units(elem, units) + for elem in func( + strip_units(ds1), strip_units(convert_units(ds2, units)), **stripped_kwargs + ) + ) + result_a, result_b = func(ds1, ds2) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +def test_broadcast_dataarray(dtype): + array1 = np.linspace(0, 10, 2) * unit_registry.Pa + array2 = np.linspace(0, 10, 3) * unit_registry.Pa + + a = xr.DataArray(data=array1, dims="x") + b = xr.DataArray(data=array2, dims="y") + + expected_a, expected_b = tuple( + attach_units(elem, extract_units(a)) + for elem in xr.broadcast(strip_units(a), strip_units(b)) + ) + result_a, result_b = xr.broadcast(a, b) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +def test_broadcast_dataset(dtype): + array1 = np.linspace(0, 10, 2) * unit_registry.Pa + array2 = np.linspace(0, 10, 3) * unit_registry.Pa + + ds = xr.Dataset(data_vars={"a": ("x", array1), "b": ("y", array2)}) + + (expected,) = tuple( + attach_units(elem, extract_units(ds)) for elem in xr.broadcast(strip_units(ds)) + ) + (result,) = xr.broadcast(ds) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`combine_by_coords` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_combine_by_coords(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + x = np.arange(1, 4) * 10 * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + other_array1 = np.ones_like(array1) * data_unit + other_array2 = np.ones_like(array2) * data_unit + other_x = np.arange(1, 4) * 10 * dim_unit + other_y = np.arange(2, 4) * dim_unit + other_z = np.arange(3, 6) * coord_unit + + ds = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + other = xr.Dataset( + data_vars={"a": (("y", "x"), other_array1), "b": (("y", "x"), other_array2)}, + coords={"x": other_x, "y": other_y, "z": ("x", other_z)}, + ) + + if error is not None: + with pytest.raises(error): + xr.combine_by_coords([ds, other]) + + return + + units = extract_units(ds) + expected = attach_units( + xr.combine_by_coords( + [strip_units(ds), strip_units(convert_units(other, units))] + ), + units, + ) + result = xr.combine_by_coords([ds, other]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + 
pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_combine_nested(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + + x = np.arange(1, 4) * 10 * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + ds1 = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + ds2 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.ones_like(array1) * data_unit), + "b": (("y", "x"), np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(3) * dim_unit, + "y": np.arange(2, 4) * dim_unit, + "z": ("x", np.arange(-3, 0) * coord_unit), + }, + ) + ds3 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.zeros_like(array1) * np.nan * data_unit), + "b": (("y", "x"), np.zeros_like(array2) * np.nan * data_unit), + }, + coords={ + "x": np.arange(3, 6) * dim_unit, + "y": np.arange(4, 6) * dim_unit, + "z": ("x", np.arange(3, 6) * coord_unit), + }, + ) + ds4 = xr.Dataset( + data_vars={ + "a": (("y", "x"), -1 * np.ones_like(array1) * data_unit), + "b": (("y", "x"), -1 * np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(6, 9) * dim_unit, + "y": np.arange(6, 8) * dim_unit, + "z": ("x", np.arange(6, 9) * coord_unit), + }, + ) + + func = function(xr.combine_nested, concat_dim=["x", "y"]) + if error is not None: + with pytest.raises(error): + func([[ds1, ds2], [ds3, ds4]]) + + return + + units = extract_units(ds1) + convert_and_strip = lambda ds: strip_units(convert_units(ds, units)) + expected = attach_units( + func( + [ + [strip_units(ds1), convert_and_strip(ds2)], + [convert_and_strip(ds3), convert_and_strip(ds4)], + ] + ), + units, + ) + result = func([[ds1, ds2], [ds3, ds4]]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`concat` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + ), +) +def test_concat_dataarray(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = {"data": (unit, original_unit), "dims": (original_unit, unit)} + data_unit, dims_unit = variants.get(variant) + + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 5).astype(dtype) * data_unit + x1 = np.arange(5, 15) * original_unit + x2 = np.arange(5) * dims_unit + + arr1 = xr.DataArray(data=array1, coords={"x": x1}, dims="x") + arr2 = 
xr.DataArray(data=array2, coords={"x": x2}, dims="x") + + if error is not None: + with pytest.raises(error): + xr.concat([arr1, arr2], dim="x") + + return + + expected = attach_units( + xr.concat([strip_units(arr1), strip_units(arr2)], dim="x"), extract_units(arr1) + ) + result = xr.concat([arr1, arr2], dim="x") + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`concat` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + ), +) +def test_concat_dataset(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = {"data": (unit, original_unit), "dims": (original_unit, unit)} + data_unit, dims_unit = variants.get(variant) + + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 5).astype(dtype) * data_unit + x1 = np.arange(5, 15) * original_unit + x2 = np.arange(5) * dims_unit + + ds1 = xr.Dataset(data_vars={"a": ("x", array1)}, coords={"x": x1}) + ds2 = xr.Dataset(data_vars={"a": ("x", array2)}, coords={"x": x2}) + + if error is not None: + with pytest.raises(error): + xr.concat([ds1, ds2], dim="x") + + return + + expected = attach_units( + xr.concat([strip_units(ds1), strip_units(ds2)], dim="x"), extract_units(ds1) + ) + result = xr.concat([ds1, ds2], dim="x") + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_merge_dataarray(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 1, 2 * 3).reshape(2, 3).astype(dtype) * original_unit + array2 = np.linspace(1, 2, 2 * 4).reshape(2, 4).astype(dtype) * data_unit + array3 = np.linspace(0, 2, 3 * 4).reshape(3, 4).astype(dtype) * data_unit + + x = np.arange(2) * original_unit + y = np.arange(3) * original_unit + z = np.arange(4) * original_unit + u = np.linspace(10, 20, 2) * original_unit + v = np.linspace(10, 20, 3) * original_unit + w = np.linspace(10, 20, 4) * original_unit + + arr1 = xr.DataArray( + name="a", + data=array1, + coords={"x": x, "y": y, "u": ("x", u), "v": ("y", v)}, + dims=("x", "y"), + ) + arr2 = xr.DataArray( + name="b", + data=array2, + coords={ + "x": np.arange(2, 4) * dim_unit, + "z": z, + "u": ("x", np.linspace(20, 30, 2) * coord_unit), + "w": ("z", w), + }, + dims=("x", 
"z"), + ) + arr3 = xr.DataArray( + name="c", + data=array3, + coords={ + "y": np.arange(3, 6) * dim_unit, + "z": np.arange(4, 8) * dim_unit, + "v": ("y", np.linspace(10, 20, 3) * coord_unit), + "w": ("z", np.linspace(10, 20, 4) * coord_unit), + }, + dims=("y", "z"), + ) + + func = function(xr.merge) + if error is not None: + with pytest.raises(error): + func([arr1, arr2, arr3]) + + return + + units = {name: original_unit for name in list("abcuvwxyz")} + convert_and_strip = lambda arr: strip_units(convert_units(arr, units)) + expected = attach_units( + func([strip_units(arr1), convert_and_strip(arr2), convert_and_strip(arr3)]), + units, + ) + result = func([arr1, arr2, arr3]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_merge_dataset(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + + x = np.arange(11, 14) * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + ds1 = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + ds2 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.ones_like(array1) * data_unit), + "b": (("y", "x"), np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(3) * dim_unit, + "y": np.arange(2, 4) * dim_unit, + "z": ("x", np.arange(-3, 0) * coord_unit), + }, + ) + ds3 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.zeros_like(array1) * np.nan * data_unit), + "b": (("y", "x"), np.zeros_like(array2) * np.nan * data_unit), + }, + coords={ + "x": np.arange(3, 6) * dim_unit, + "y": np.arange(4, 6) * dim_unit, + "z": ("x", np.arange(3, 6) * coord_unit), + }, + ) + + func = function(xr.merge) + if error is not None: + with pytest.raises(error): + func([ds1, ds2, ds3]) + + return + + units = extract_units(ds1) + convert_and_strip = lambda ds: strip_units(convert_units(ds, units)) + expected = attach_units( + func([strip_units(ds1), convert_and_strip(ds2), convert_and_strip(ds3)]), units + ) + result = func([ds1, ds2, ds3]) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like)) -def test_replication(func, dtype): +def test_replication_dataarray(func, dtype): array = np.linspace(0, 10, 20).astype(dtype) * unit_registry.s data_array = xr.DataArray(data=array, dims="x") @@ -330,8 +1002,33 @@ def test_replication(func, dtype): assert_equal_with_units(expected, result) +@pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like)) +def test_replication_dataset(func, dtype): + array1 = np.linspace(0, 10, 20).astype(dtype) * 
unit_registry.s + array2 = np.linspace(5, 10, 10).astype(dtype) * unit_registry.Pa + x = np.arange(20).astype(dtype) * unit_registry.m + y = np.arange(10).astype(dtype) * unit_registry.m + z = y.to(unit_registry.mm) + + ds = xr.Dataset( + data_vars={"a": ("x", array1), "b": ("y", array2)}, + coords={"x": x, "y": y, "z": ("y", z)}, + ) + + numpy_func = getattr(np, func.__name__) + expected = ds.copy( + data={name: numpy_func(array.data) for name, array in ds.data_vars.items()} + ) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail( - reason="np.full_like on Variable strips the unit and pint does not allow mixed args" + reason=( + "pint is undecided on how `full_like` should work, so incorrect errors " + "may be expected: hgrecco/pint#882" + ) ) @pytest.mark.parametrize( "unit,error", @@ -344,8 +1041,9 @@ def test_replication(func, dtype): pytest.param(unit_registry.ms, None, id="compatible_unit"), pytest.param(unit_registry.s, None, id="identical_unit"), ), + ids=repr, ) -def test_replication_full_like(unit, error, dtype): +def test_replication_full_like_dataarray(unit, error, dtype): array = np.linspace(0, 5, 10) * unit_registry.s data_array = xr.DataArray(data=array, dims="x") @@ -360,6 +1058,163 @@ def test_replication_full_like(unit, error, dtype): assert_equal_with_units(expected, result) +@pytest.mark.xfail( + reason=( + "pint is undecided on how `full_like` should work, so incorrect errors " + "may be expected: hgrecco/pint#882" + ) +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.m, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.ms, None, id="compatible_unit"), + pytest.param(unit_registry.s, None, id="identical_unit"), + ), + ids=repr, +) +def test_replication_full_like_dataset(unit, error, dtype): + array1 = np.linspace(0, 10, 20).astype(dtype) * unit_registry.s + array2 = np.linspace(5, 10, 10).astype(dtype) * unit_registry.Pa + x = np.arange(20).astype(dtype) * unit_registry.m + y = np.arange(10).astype(dtype) * unit_registry.m + z = y.to(unit_registry.mm) + + ds = xr.Dataset( + data_vars={"a": ("x", array1), "b": ("y", array2)}, + coords={"x": x, "y": y, "z": ("y", z)}, + ) + + fill_value = -1 * unit + if error is not None: + with pytest.raises(error): + xr.full_like(ds, fill_value=fill_value) + + return + + expected = ds.copy( + data={ + name: np.full_like(array, fill_value=fill_value) + for name, array in ds.data_vars.items() + } + ) + result = xr.full_like(ds, fill_value=fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`where` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize("fill_value", (np.nan, 10.2)) +def test_where_dataarray(fill_value, unit, error, dtype): + array = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + + x = xr.DataArray(data=array, dims="x") + cond = x < 5 * unit_registry.m + # FIXME: this should work without wrapping in array() + fill_value = np.array(fill_value) * unit + + 
if error is not None: + with pytest.raises(error): + xr.where(cond, x, fill_value) + + return + + fill_value_ = ( + fill_value.to(unit_registry.m) + if isinstance(fill_value, unit_registry.Quantity) + and fill_value.check(unit_registry.m) + else fill_value + ) + expected = attach_units( + xr.where(cond, strip_units(x), strip_units(fill_value_)), extract_units(x) + ) + result = xr.where(cond, x, fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`where` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize("fill_value", (np.nan, 10.2)) +def test_where_dataset(fill_value, unit, error, dtype): + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 10).astype(dtype) * unit_registry.m + x = np.arange(10) * unit_registry.s + + ds = xr.Dataset(data_vars={"a": ("x", array1), "b": ("x", array2)}, coords={"x": x}) + cond = ds.x < 5 * unit_registry.s + # FIXME: this should work without wrapping in array() + fill_value = np.array(fill_value) * unit + + if error is not None: + with pytest.raises(error): + xr.where(cond, ds, fill_value) + + return + + fill_value_ = ( + fill_value.to(unit_registry.m) + if isinstance(fill_value, unit_registry.Quantity) + and fill_value.check(unit_registry.m) + else fill_value + ) + expected = attach_units( + xr.where(cond, strip_units(ds), strip_units(fill_value_)), extract_units(ds) + ) + result = xr.where(cond, ds, fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="pint does not implement `np.einsum`") +def test_dot_dataarray(dtype): + array1 = ( + np.linspace(0, 10, 5 * 10).reshape(5, 10).astype(dtype) + * unit_registry.m + / unit_registry.s + ) + array2 = ( + np.linspace(10, 20, 10 * 20).reshape(10, 20).astype(dtype) * unit_registry.s + ) + + arr1 = xr.DataArray(data=array1, dims=("x", "y")) + arr2 = xr.DataArray(data=array2, dims=("y", "z")) + + expected = array1.dot(array2) + result = xr.dot(arr1, arr2) + + assert_equal_with_units(expected, result) + + class TestDataArray: @pytest.mark.filterwarnings("error:::pint[.*]") @pytest.mark.parametrize( From 8b240376fd91352a80b068af606850e8d57d1090 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Wed, 13 Nov 2019 22:56:59 -0500 Subject: [PATCH 08/24] add Variable._replace (#3528) * add Variable._replace * assertions * whatsew * whatsnew --- doc/whats-new.rst | 3 +++ xarray/core/variable.py | 19 +++++++++++++++++-- xarray/tests/test_variable.py | 9 +++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a7687368884..b8fb1f8f58e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -138,6 +138,9 @@ Internal Changes - Enable type checking on default sentinel values (:pull:`3472`) By `Maximilian Roos `_ +- Add :py:meth:`Variable._replace` for simpler replacing of a subset of attributes (:pull:`3472`) + By `Maximilian Roos `_ + .. 
_whats-new.0.14.0: v0.14.0 (14 Oct 2019) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index cf97c997017..e630dc4b457 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,3 +1,4 @@ +import copy import functools import itertools import warnings @@ -24,10 +25,11 @@ from .pycompat import dask_array_type, integer_types from .utils import ( OrderedSet, + _default, decode_numpy_dict_values, either_dict_or_kwargs, - infix_dims, ensure_us_time_resolution, + infix_dims, ) try: @@ -887,7 +889,20 @@ def copy(self, deep=True, data=None): # note: # dims is already an immutable tuple # attributes and encoding will be copied when the new Array is created - return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) + return self._replace(data=data) + + def _replace( + self, dims=_default, data=_default, attrs=_default, encoding=_default + ) -> "Variable": + if dims is _default: + dims = copy.copy(self._dims) + if data is _default: + data = copy.copy(self.data) + if attrs is _default: + attrs = copy.copy(self._attrs) + if encoding is _default: + encoding = copy.copy(self._encoding) + return type(self)(dims, data, attrs, encoding, fastpath=True) def __copy__(self): return self.copy(deep=False) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d394919dbdd..d92a68729b5 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -542,6 +542,15 @@ def test_copy_index_with_data_errors(self): with raises_regex(ValueError, "must match shape of object"): orig.copy(data=new_data) + def test_replace(self): + var = Variable(("x", "y"), [[1.5, 2.0], [3.1, 4.3]], {"foo": "bar"}) + result = var._replace() + assert_identical(result, var) + + new_data = np.arange(4).reshape(2, 2) + result = var._replace(data=new_data) + assert_array_equal(result.data, new_data) + def test_real_and_imag(self): v = self.cls("x", np.arange(3) - 1j * np.arange(3), {"foo": "bar"}) expected_re = self.cls("x", np.arange(3), {"foo": "bar"}) From c0ef2f616e87e9f924425bcd373ac265f14203cb Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 14 Nov 2019 20:56:17 +0900 Subject: [PATCH 09/24] Fix set_index when an existing dimension becomes a level (#3520) * Added a test * Fix set_index * lint * black / mypy * Use _replace method * whats new --- doc/whats-new.rst | 2 ++ xarray/core/dataarray.py | 10 +++++----- xarray/core/dataset.py | 12 ++++++++++-- xarray/tests/test_dataarray.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8fb1f8f58e..abd94779435 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,8 @@ New Features Bug fixes ~~~~~~~~~ +- Fix a bug in `set_index` in case that an existing dimension becomes a level variable of MultiIndex. (:pull:`3520`) + By `Keisuke Fujii `_. - Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) By `Anderson Banihirwe `_. 
- Fix regression introduced in v0.14.0 that would cause a crash if dask is installed diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a192fe08cee..55e73478260 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -48,7 +48,7 @@ assert_coordinate_consistent, remap_label_indexers, ) -from .dataset import Dataset, merge_indexes, split_indexes +from .dataset import Dataset, split_indexes from .formatting import format_item from .indexes import Indexes, default_indexes from .merge import PANDAS_TYPES @@ -1601,10 +1601,10 @@ def set_index( -------- DataArray.reset_index """ - _check_inplace(inplace) - indexes = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") - coords, _ = merge_indexes(indexes, self._coords, set(), append=append) - return self._replace(coords=coords) + ds = self._to_temp_dataset().set_index( + indexes, append=append, inplace=inplace, **indexes_kwargs + ) + return self._from_temp_dataset(ds) def reset_index( self, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 15a7209ab24..de713b830f2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -204,6 +204,7 @@ def merge_indexes( """ vars_to_replace: Dict[Hashable, Variable] = {} vars_to_remove: List[Hashable] = [] + dims_to_replace: Dict[Hashable, Hashable] = {} error_msg = "{} is not the name of an existing variable." for dim, var_names in indexes.items(): @@ -244,7 +245,7 @@ def merge_indexes( if not len(names) and len(var_names) == 1: idx = pd.Index(variables[var_names[0]].values) - else: + else: # MultiIndex for n in var_names: try: var = variables[n] @@ -256,15 +257,22 @@ def merge_indexes( levels.append(cat.categories) idx = pd.MultiIndex(levels, codes, names=names) + for n in names: + dims_to_replace[n] = dim vars_to_replace[dim] = IndexVariable(dim, idx) vars_to_remove.extend(var_names) new_variables = {k: v for k, v in variables.items() if k not in vars_to_remove} new_variables.update(vars_to_replace) + + # update dimensions if necessary GH: 3512 + for k, v in new_variables.items(): + if any(d in dims_to_replace for d in v.dims): + new_dims = [dims_to_replace.get(d, d) for d in v.dims] + new_variables[k] = v._replace(dims=new_dims) new_coord_names = coord_names | set(vars_to_replace) new_coord_names -= set(vars_to_remove) - return new_variables, new_coord_names diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 7c6dc1825a1..4c3553c867e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1182,6 +1182,16 @@ def test_selection_multiindex_remove_unused(self): expected = expected.set_index(xy=["x", "y"]).unstack() assert_identical(expected, actual) + def test_selection_multiindex_from_level(self): + # GH: 3512 + da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) + db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) + data = xr.concat([da, db], dim="x").set_index(xy=["x", "y"]) + assert data.dims == ("xy",) + actual = data.sel(y="a") + expected = data.isel(xy=[0, 1]).unstack("xy").squeeze("y").drop("y") + assert_equal(actual, expected) + def test_virtual_default_coords(self): array = DataArray(np.zeros((5,)), dims="x") expected = DataArray(range(5), dims="x", name="x") From 7b4a286f59bc7d60d4e4d03be65562ff63f9b111 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Thu, 14 Nov 2019 11:56:49 -0500 Subject: [PATCH 10/24] units & deprecation merge (#3530) --- xarray/tests/test_units.py | 6 +++--- 1 file changed, 
3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 509a50d23ff..0be6f8af464 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -1969,7 +1969,7 @@ def test_broadcast_equals(self, unit, dtype): dim={"z": np.linspace(10, 20, 12) * unit_registry.s}, axis=1, ), - method("drop", labels="x"), + method("drop_sel", labels="x"), method("reset_coords", names="x2"), method("copy"), pytest.param( @@ -4045,7 +4045,7 @@ def test_reindex_like(self, unit, error, dtype): marks=pytest.mark.xfail(reason="strips units"), ), pytest.param( - method("apply", np.fabs), + method("map", np.fabs), marks=pytest.mark.xfail(reason="fabs strips units"), ), ), @@ -4220,7 +4220,7 @@ def test_grouped_operations(self, func, dtype): method("rename_dims", x="offset_x"), method("swap_dims", {"x": "x2"}), method("expand_dims", v=np.linspace(10, 20, 12) * unit_registry.s, axis=1), - method("drop", labels="x"), + method("drop_sel", labels="x"), method("drop_dims", "z"), method("set_coords", names="c"), method("reset_coords", names="x2"), From ee9da17ef04035cf318b6f1a4bb413f3d10ae614 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 15 Nov 2019 14:53:16 +0000 Subject: [PATCH 11/24] interpolate_na: Add max_gap support. (#3302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * interpolate_na: Add maxgap support. * Add docs. * Add requires_bottleneck to test. * Review comments. * Update xarray/core/dataarray.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Update xarray/core/dataset.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * maxgap → max_gap * update whats-new * update computation.rst * Better support uniformly spaced coordinates. Split legnths, interp test * Raise error for max_gap and irregularly spaced coordinates + test * rework. * Use pandas checks for index duplication and monotonicity. * Progress + add datetime. * nicer error message * A few fstrings. * finish up timedelta max_gap. * fix whats-new * small fixes. * fix dan's test. * remove redundant test. * nicer error message. * Add xfailed cftime tests * better error checking and tests. * typing. * update docstrings * scipy intersphinx * fix tests * add bottleneck testing decorator. --- doc/computation.rst | 3 + doc/conf.py | 11 +-- doc/whats-new.rst | 4 ++ xarray/core/dataarray.py | 58 +++++++++++----- xarray/core/dataset.py | 60 +++++++++++----- xarray/core/missing.py | 110 +++++++++++++++++++++++++---- xarray/tests/test_missing.py | 130 ++++++++++++++++++++++++++++++++++- 7 files changed, 322 insertions(+), 54 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index 663c546be20..240a1e5704b 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -95,6 +95,9 @@ for filling missing values via 1D interpolation. Note that xarray slightly diverges from the pandas ``interpolate`` syntax by providing the ``use_coordinate`` keyword which facilitates a clear specification of which values to use as the index in the interpolation. +xarray also provides the ``max_gap`` keyword argument to limit the interpolation to +data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na` +for more. Aggregation =========== diff --git a/doc/conf.py b/doc/conf.py index 7c1557a1e66..0e04f8ccde8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -340,9 +340,10 @@ # Example configuration for intersphinx: refer to the Python standard library. 
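# The mappings below are normalized to drop trailing slashes, and a "scipy"
# entry is added so that the :py:func:`scipy.interpolate.interp1d`-style
# references in the new interpolate_na docstrings resolve via intersphinx.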
intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), - "iris": ("http://scitools.org.uk/iris/docs/latest/", None), - "numpy": ("https://docs.scipy.org/doc/numpy/", None), - "numba": ("https://numba.pydata.org/numba-doc/latest/", None), - "matplotlib": ("https://matplotlib.org/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), + "iris": ("https://scitools.org.uk/iris/docs/latest", None), + "numpy": ("https://docs.scipy.org/doc/numpy", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "numba": ("https://numba.pydata.org/numba-doc/latest", None), + "matplotlib": ("https://matplotlib.org", None), } diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..053f785bc05 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,10 @@ Breaking changes New Features ~~~~~~~~~~~~ + +- Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and + :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data + gap that will be filled by interpolation. By `Deepak Cherian `_. - :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 55e73478260..7ce775b49cd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2018,44 +2018,69 @@ def fillna(self, value: Any) -> "DataArray": def interpolate_na( self, - dim=None, + dim: Hashable = None, method: str = "linear", limit: int = None, use_coordinate: Union[bool, str] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "DataArray": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline', 'akima'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their - respective``scipy.interpolate`` classes. - use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. 
If use_coordinate is a string, it specifies the name of a + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - DataArray + interpolated: DataArray + Filled in DataArray. See also -------- @@ -2070,6 +2095,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index de713b830f2..913842c4eba 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3908,42 +3908,65 @@ def interpolate_na( method: str = "linear", limit: int = None, use_coordinate: Union[bool, Hashable] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "Dataset": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- - dim : Hashable + dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline': use their respective - ``scipy.interpolate`` classes.
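A minimal usage sketch of the ``max_gap`` behaviour documented above (hypothetical data; gap lengths follow the coordinate-difference definition given in the docstring):

import numpy as np
import xarray as xr

da = xr.DataArray(
    [0.0, 1.0, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0],
    dims="x",
    coords={"x": np.arange(8)},
)
# The one-NaN gap has length 3 - 1 = 2; the three-NaN gap has length 7 - 3 = 4.
# With max_gap=2, only the first gap is interpolated:
filled = da.interpolate_na(dim="x", max_gap=2)
# filled -> [0, 1, 2, 3, nan, nan, nan, 7]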
- use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. If use_coordinate is a string, it specifies the name of a + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. - kwargs : any - parameters passed verbatim to the underlying interplation function + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - Dataset + interpolated: Dataset + Filled in Dataset. See also -------- @@ -3959,6 +3982,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) return new diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 77dde66484e..117fcaf8f81 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,18 +1,46 @@ import warnings from functools import partial -from typing import Any, Callable, Dict, Sequence +from numbers import Number +from typing import Any, Callable, Dict, Hashable, Sequence, Union import numpy as np import pandas as pd from . import utils -from .common import _contains_datetime_like_objects +from .common import _contains_datetime_like_objects, ones_like from .computation import apply_ufunc from .duck_array_ops import dask_array_type from .utils import OrderedSet, is_scalar from .variable import Variable, broadcast_variables +def _get_nan_block_lengths(obj, dim: Hashable, index: Variable): + """ + Return an object where each NaN element in 'obj' is replaced by the + length of the gap the element is in.
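+
+    For example, with index = [0, 1, 2, 3] and obj = [1, nan, nan, 4], the
+    two NaN elements are replaced by 3 (the coordinate difference between the
+    valid points bounding the gap) and the valid elements by 0.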
+ """ + + # make variable so that we get broadcasting for free + index = Variable([dim], index) + + # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 + arange = ones_like(obj) * index + valid = obj.notnull() + valid_arange = arange.where(valid) + cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) + + nan_block_lengths = ( + cumulative_nans.diff(dim=dim, label="upper") + .reindex({dim: obj[dim]}) + .where(valid) + .bfill(dim=dim) + .where(~valid, 0) + .fillna(index[-1] - valid_arange.max()) + ) + + return nan_block_lengths + + class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods """ @@ -178,7 +206,7 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds -def get_clean_interp_index(arr, dim, use_coordinate=True): +def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True): """get index to use for x values in interpolation. If use_coordinate is True, the coordinate that shares the name of the @@ -195,23 +223,33 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): index = arr.coords[use_coordinate] if index.ndim != 1: raise ValueError( - "Coordinates used for interpolation must be 1D, " - "%s is %dD." % (use_coordinate, index.ndim) + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {index.ndim}D." ) + index = index.to_index() + + # TODO: index.name is None for multiindexes + # set name for nice error messages below + if isinstance(index, pd.MultiIndex): + index.name = dim + + if not index.is_monotonic: + raise ValueError(f"Index {index.name!r} must be monotonically increasing") + + if not index.is_unique: + raise ValueError(f"Index {index.name!r} has duplicate values") # raise if index cannot be cast to a float (e.g. MultiIndex) try: index = index.values.astype(np.float64) except (TypeError, ValueError): # pandas raises a TypeError - # xarray/nuppy raise a ValueError + # xarray/numpy raise a ValueError raise TypeError( - "Index must be castable to float64 to support" - "interpolation, got: %s" % type(index) + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation, got {type(index).__name__}." ) - # check index sorting now so we can skip it later - if not (np.diff(index) > 0).all(): - raise ValueError("Index must be monotonicly increasing") + else: axis = arr.get_axis_num(dim) index = np.arange(arr.shape[axis], dtype=np.float64) @@ -220,7 +258,13 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): def interp_na( - self, dim=None, use_coordinate=True, method="linear", limit=None, **kwargs + self, + dim: Hashable = None, + use_coordinate: Union[bool, str] = True, + method: str = "linear", + limit: int = None, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, + **kwargs, ): """Interpolate values according to different methods. """ @@ -230,6 +274,40 @@ def interp_na( if limit is not None: valids = _get_valid_fill_mask(self, dim, limit) + if max_gap is not None: + max_type = type(max_gap).__name__ + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if ( + dim in self.indexes + and isinstance(self.indexes[dim], pd.DatetimeIndex) + and use_coordinate + ): + if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)): + raise TypeError( + f"Underlying index is DatetimeIndex. 
Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}" + ) + + if isinstance(max_gap, str): + try: + max_gap = pd.to_timedelta(max_gap) + except ValueError: + raise ValueError( + f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta" + ) + + if isinstance(max_gap, pd.Timedelta): + max_gap = np.timedelta64(max_gap.value, "ns") + + max_gap = np.timedelta64(max_gap, "ns").astype(np.float64) + + if not use_coordinate: + if not isinstance(max_gap, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." + ) + # method index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) @@ -253,6 +331,14 @@ def interp_na( if limit is not None: arr = arr.where(valids) + if max_gap is not None: + if dim not in self.coords: + raise NotImplementedError( + "max_gap not implemented for unlabeled coordinates yet." + ) + nan_block_lengths = _get_nan_block_lengths(self, dim, index) + arr = arr.where(nan_block_lengths <= max_gap) + return arr diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index cfce5d6f645..0b410383a34 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -5,7 +5,13 @@ import pytest import xarray as xr -from xarray.core.missing import NumpyInterpolator, ScipyInterpolator, SplineInterpolator +from xarray.core.missing import ( + NumpyInterpolator, + ScipyInterpolator, + SplineInterpolator, + get_clean_interp_index, + _get_nan_block_lengths, +) from xarray.core.pycompat import dask_array_type from xarray.tests import ( assert_array_equal, @@ -153,7 +159,7 @@ def test_interpolate_pd_compat_polynomial(): def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) expected = xr.DataArray(vals, dims="x", coords={"x": [2, 1, 3]}) - with raises_regex(ValueError, "Index must be monotonicly increasing"): + with raises_regex(ValueError, "Index 'x' must be monotonically increasing"): expected.interpolate_na(dim="x", method="index") @@ -169,12 +175,19 @@ def test_interpolate_invalid_interpolator_raises(): da.interpolate_na(dim="x", method="foo") +def test_interpolate_duplicate_values_raises(): + data = np.random.randn(2, 3) + da = xr.DataArray(data, coords=[("x", ["a", "a"]), ("y", [0, 1, 2])]) + with raises_regex(ValueError, "Index 'x' has duplicate values"): + da.interpolate_na(dim="x", method="foo") + + def test_interpolate_multiindex_raises(): data = np.random.randn(2, 3) data[1, 1] = np.nan da = xr.DataArray(data, coords=[("x", ["a", "b"]), ("y", [0, 1, 2])]) das = da.stack(z=("x", "y")) - with raises_regex(TypeError, "Index must be castable to float64"): + with raises_regex(TypeError, "Index 'z' must be castable to float64"): das.interpolate_na(dim="z") @@ -439,3 +452,114 @@ def test_ffill_dataset(ds): @requires_bottleneck def test_bfill_dataset(ds): ds.ffill(dim="time") + + +@requires_bottleneck +@pytest.mark.parametrize( + "y, lengths", + [ + [np.arange(9), [[3, 3, 3, 0, 3, 3, 0, 2, 2]]], + [np.arange(9) * 3, [[9, 9, 9, 0, 9, 9, 0, 6, 6]]], + [[0, 2, 5, 6, 7, 8, 10, 12, 14], [[6, 6, 6, 0, 4, 4, 0, 4, 4]]], + ], +) +def test_interpolate_na_nan_block_lengths(y, lengths): + arr = [[np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan]] + da = xr.DataArray(arr * 2, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = 
_get_nan_block_lengths(da, dim="y", index=index) + expected = da.copy(data=lengths * 2) + assert_equal(actual, expected) + + +@pytest.fixture +def da_time(): + return xr.DataArray( + [np.nan, 1, 2, np.nan, np.nan, 5, np.nan, np.nan, np.nan, np.nan, 10], + dims=["t"], + ) + + +def test_interpolate_na_max_gap_errors(da_time): + with raises_regex( + NotImplementedError, "max_gap not implemented for unlabeled coordinates" + ): + da_time.interpolate_na("t", max_gap=1) + + with raises_regex(ValueError, "max_gap must be a scalar."): + da_time.interpolate_na("t", max_gap=(1,)) + + da_time["t"] = pd.date_range("2001-01-01", freq="H", periods=11) + with raises_regex(TypeError, "Underlying index is"): + da_time.interpolate_na("t", max_gap=1) + + with raises_regex(TypeError, "Expected integer or floating point"): + da_time.interpolate_na("t", max_gap="1H", use_coordinate=False) + + with raises_regex(ValueError, "Could not convert 'huh' to timedelta64"): + da_time.interpolate_na("t", max_gap="huh") + + +@requires_bottleneck +@pytest.mark.parametrize( + "time_range_func", + [pd.date_range, pytest.param(xr.cftime_range, marks=pytest.mark.xfail)], +) +@pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.to_dataset(name="a")]) +@pytest.mark.parametrize( + "max_gap", ["3H", np.timedelta64(3, "h"), pd.to_timedelta("3H")] +) +def test_interpolate_na_max_gap_time_specifier( + da_time, max_gap, transform, time_range_func +): + da_time["t"] = time_range_func("2001-01-01", freq="H", periods=11) + expected = transform( + da_time.copy(data=[np.nan, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan, 10]) + ) + actual = transform(da_time).interpolate_na("t", max_gap=max_gap) + assert_equal(actual, expected) + + +@requires_bottleneck +@pytest.mark.parametrize( + "coords", + [ + pytest.param(None, marks=pytest.mark.xfail()), + {"x": np.arange(4), "y": np.arange(11)}, + ], +) +def test_interpolate_na_2d(coords): + da = xr.DataArray( + [ + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + ], + dims=["x", "y"], + coords=coords, + ) + + actual = da.interpolate_na("y", max_gap=2) + expected_y = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11], + ] + ) + assert_equal(actual, expected_y) + + actual = da.interpolate_na("x", max_gap=3) + expected_x = xr.DataArray( + [ + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11], + ], + dims=["x", "y"], + coords=coords, + ) + assert_equal(actual, expected_x) From aa876cfd6b3b97ee5028d089ec741d057e3ed688 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 15 Nov 2019 17:43:53 +0000 Subject: [PATCH 12/24] Leave empty slot when not using accessors --- xarray/core/dataarray.py | 5 ++--- xarray/core/dataset.py | 6 ++---- xarray/core/extensions.py | 13 +++++++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7ce775b49cd..b27a61d530b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -249,14 +249,14 @@ class 
DataArray(AbstractArray, DataWithCoords): Dictionary for holding arbitrary metadata. """ - _accessors: Optional[Dict[str, Any]] # noqa + _cache: Dict[str, Any] _coords: Dict[Any, Variable] _indexes: Optional[Dict[Hashable, pd.Index]] _name: Optional[Hashable] _variable: Variable __slots__ = ( - "_accessors", + "_cache", "_coords", "_file_obj", "_indexes", @@ -373,7 +373,6 @@ def __init__( assert isinstance(coords, dict) self._coords = coords self._name = name - self._accessors = None # TODO(shoyer): document this argument, once it becomes part of the # public interface. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 913842c4eba..ea310dd164b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -419,8 +419,8 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): coordinates used for label based indexing. """ - _accessors: Optional[Dict[str, Any]] _attrs: Optional[Dict[Hashable, Any]] + _cache: Dict[str, Any] _coord_names: Set[Hashable] _dims: Dict[Hashable, int] _encoding: Optional[Dict[Hashable, Any]] @@ -428,8 +428,8 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): _variables: Dict[Hashable, Variable] __slots__ = ( - "_accessors", "_attrs", + "_cache", "_coord_names", "_dims", "_encoding", @@ -535,7 +535,6 @@ def __init__( data_vars, coords, compat=compat ) - self._accessors = None self._attrs = dict(attrs) if attrs is not None else None self._file_obj = None self._encoding = None @@ -870,7 +869,6 @@ def _construct_direct( obj._attrs = attrs obj._file_obj = file_obj obj._encoding = encoding - obj._accessors = None return obj @classmethod diff --git a/xarray/core/extensions.py b/xarray/core/extensions.py index f473eaa497d..79abbccea39 100644 --- a/xarray/core/extensions.py +++ b/xarray/core/extensions.py @@ -20,10 +20,15 @@ def __get__(self, obj, cls): # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor + # Use the same dict as @pandas.util.cache_readonly. + # It must be explicitly declared in obj.__slots__. try: - return obj._accessors[self._name] - except TypeError: - obj._accessors = {} + cache = obj._cache + except AttributeError: + cache = obj._cache = {} + + try: + return cache[self._name] except KeyError: pass @@ -35,7 +40,7 @@ def __get__(self, obj, cls): # something else (GH933): raise RuntimeError("error initializing %r accessor." % self._name) - obj._accessors[self._name] = accessor_obj + cache[self._name] = accessor_obj return accessor_obj From 68b004fe5033f4a991d152190864ee1180845806 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Fri, 15 Nov 2019 20:49:29 +0100 Subject: [PATCH 13/24] ensure rename does not change index type (#3532) * ensure rename does not change index type * test requires cftime * test orig.indexes[time].name is conserved * use index.rename() --- doc/whats-new.rst | 4 +++ xarray/core/dataset.py | 2 +- xarray/tests/test_dataset.py | 49 ++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 053f785bc05..3c3bf127a3f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -83,6 +83,10 @@ New Features Bug fixes ~~~~~~~~~ +- Ensure an index of type ``CFTimeIndex`` is not converted to a ``DatetimeIndex`` when + calling :py:meth:`Dataset.rename` (also :py:meth:`Dataset.rename_dims` + and :py:meth:`xr.Dataset.rename_vars`). By `Mathias Hauser `_ + (:issue:`3522`). - Fix a bug in `set_index` in case that an existing dimension becomes a level variable of MultiIndex. 
(:pull:`3520`) By `Keisuke Fujii `_. - Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ea310dd164b..3a83b477681 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2665,7 +2665,7 @@ def _rename_indexes(self, name_dict, dims_set): verify_integrity=False, ) else: - index = pd.Index(v, name=new_name) + index = v.rename(new_name) indexes[new_name] = index return indexes diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 67d3b3198dc..780843f2e61 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.indexes.datetimes import DatetimeIndex import xarray as xr from xarray import ( @@ -22,6 +23,7 @@ open_dataset, set_options, ) +from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.npcompat import IS_NEP18_ACTIVE @@ -2458,6 +2460,53 @@ def test_rename_vars(self): with pytest.raises(ValueError): original.rename_vars(names_dict_bad) + @requires_cftime + def test_rename_does_not_change_CFTimeIndex_type(self): + # make sure CFTimeIndex is not converted to DatetimeIndex #3522 + + time = xr.cftime_range(start="2000", periods=6, freq="2MS", calendar="noleap") + orig = Dataset(coords={"time": time}) + + renamed = orig.rename(time="time_new") + assert "time_new" in renamed.indexes + assert isinstance(renamed.indexes["time_new"], CFTimeIndex) + assert renamed.indexes["time_new"].name == "time_new" + + # check original has not changed + assert "time" in orig.indexes + assert isinstance(orig.indexes["time"], CFTimeIndex) + assert orig.indexes["time"].name == "time" + + # note: rename_dims(time="time_new") drops "ds.indexes" + renamed = orig.rename_dims() + assert isinstance(renamed.indexes["time"], CFTimeIndex) + + renamed = orig.rename_vars() + assert isinstance(renamed.indexes["time"], CFTimeIndex) + + def test_rename_does_not_change_DatetimeIndex_type(self): + # make sure DatetimeIndex is conserved on rename + + time = pd.date_range(start="2000", periods=6, freq="2MS") + orig = Dataset(coords={"time": time}) + + renamed = orig.rename(time="time_new") + assert "time_new" in renamed.indexes + assert isinstance(renamed.indexes["time_new"], DatetimeIndex) + assert renamed.indexes["time_new"].name == "time_new" + + # check original has not changed + assert "time" in orig.indexes + assert isinstance(orig.indexes["time"], DatetimeIndex) + assert orig.indexes["time"].name == "time" + + # note: rename_dims(time="time_new") drops "ds.indexes" + renamed = orig.rename_dims() + assert isinstance(renamed.indexes["time"], DatetimeIndex) + + renamed = orig.rename_vars() + assert isinstance(renamed.indexes["time"], DatetimeIndex) + def test_swap_dims(self): original = Dataset({"x": [1, 2, 3], "y": ("x", list("abc")), "z": 42}) expected = Dataset({"z": 42}, {"x": ("y", [1, 2, 3]), "y": list("abc")}) From 52d48450f6291716a90f4f7e93e15847942e0da0 Mon Sep 17 00:00:00 2001 From: keewis Date: Fri, 15 Nov 2019 20:58:01 +0100 Subject: [PATCH 14/24] Add DatasetGroupBy.quantile (#3527) * move the implementation of DataArrayGroupBy.quantile to GroupBy * add tests for DatasetGroupBy * update whats-new.rst * move the item in whats-new.rst into New Features * don't drop scalar quantile coords --- doc/whats-new.rst | 2 + xarray/core/groupby.py | 107
+++++++++++++------------- xarray/tests/test_groupby.py | 143 +++++++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 68 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3c3bf127a3f..c835fbeff45 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -80,6 +80,8 @@ New Features invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`) By `Deepak Cherian `_ and `Guido Imperiale `_. +- Add the documented-but-missing :py:meth:`xarray.core.groupby.DatasetGroupBy.quantile`. + (:issue:`3525`, :pull:`3527`). By `Justus Magin `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index c73ee3cf7c5..38ecc04534a 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -557,6 +557,59 @@ def fillna(self, value): out = ops.fillna(self, value) return out + def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): + """Compute the qth quantile over each array in the groups and + concatenate them together into a new array. + + Parameters + ---------- + q : float in range of [0,1] (or sequence of floats) + Quantile to compute, which must be between 0 and 1 + inclusive. + dim : `...`, str or sequence of str, optional + Dimension(s) over which to apply quantile. + Defaults to the grouped dimension. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is + the fractional part of the index surrounded by ``i`` and + ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + + Returns + ------- + quantiles : Variable + If `q` is a single quantile, then the result is a + scalar. If multiple percentiles are given, first axis of + the result corresponds to the quantile. In either case a + quantile dimension is added to the return array. The other + dimensions are the dimensions that remain after the + reduction of the array. + + See Also + -------- + numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, + DataArray.quantile + """ + if dim is None: + dim = self._group_dim + + out = self.map( + self._obj.__class__.quantile, + shortcut=False, + q=q, + dim=dim, + interpolation=interpolation, + keep_attrs=keep_attrs, + ) + + return out + def where(self, cond, other=dtypes.NA): """Return elements from `self` or `other` depending on `cond`. @@ -737,60 +790,6 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False): combined = self._maybe_unstack(combined) return combined - def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): - """Compute the qth quantile over each array in the groups and - concatenate them together into a new array. - - Parameters - ---------- - q : float in range of [0,1] (or sequence of floats) - Quantile to compute, which must be between 0 and 1 - inclusive. - dim : `...`, str or sequence of str, optional - Dimension(s) over which to apply quantile. - Defaults to the grouped dimension. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points - ``i < j``: - * linear: ``i + (j - i) * fraction``, where ``fraction`` is - the fractional part of the index surrounded by ``i`` and - ``j``. - * lower: ``i``. - * higher: ``j``. 
- * nearest: ``i`` or ``j``, whichever is nearest. - * midpoint: ``(i + j) / 2``. - - Returns - ------- - quantiles : Variable - If `q` is a single quantile, then the result - is a scalar. If multiple percentiles are given, first axis of - the result corresponds to the quantile and a quantile dimension - is added to the return array. The other dimensions are the - dimensions that remain after the reduction of the array. - - See Also - -------- - numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, - DataArray.quantile - """ - if dim is None: - dim = self._group_dim - - out = self.map( - self._obj.__class__.quantile, - shortcut=False, - q=q, - dim=dim, - interpolation=interpolation, - keep_attrs=keep_attrs, - ) - - if np.asarray(q, dtype=np.float64).ndim == 0: - out = out.drop_vars("quantile") - return out - def reduce( self, func, dim=None, axis=None, keep_attrs=None, shortcut=True, **kwargs ): diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 581affa3471..97bd31ae050 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -137,42 +137,58 @@ def test_da_groupby_empty(): def test_da_groupby_quantile(): - array = xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])]) + array = xr.DataArray( + data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x" + ) # Scalar quantile - expected = xr.DataArray([2, 5], [("x", [1, 2])]) + expected = xr.DataArray( + data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x" + ) actual = array.groupby("x").quantile(0.5) assert_identical(expected, actual) # Vector quantile - expected = xr.DataArray([[1, 3], [4, 6]], [("x", [1, 2]), ("quantile", [0, 1])]) + expected = xr.DataArray( + data=[[1, 3], [4, 6]], + coords={"x": [1, 2], "quantile": [0, 1]}, + dims=("x", "quantile"), + ) actual = array.groupby("x").quantile([0, 1]) assert_identical(expected, actual) # Multiple dimensions array = xr.DataArray( - [[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]], - [("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])], + data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]], + coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]}, + dims=("x", "y"), ) actual_x = array.groupby("x").quantile(0, dim=...) - expected_x = xr.DataArray([1, 4], [("x", [1, 2])]) + expected_x = xr.DataArray( + data=[1, 4], coords={"x": [1, 2], "quantile": 0}, dims="x" + ) assert_identical(expected_x, actual_x) actual_y = array.groupby("y").quantile(0, dim=...) 
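    # dim=... reduces each group over all of its dimensions, so only the
    # grouped "y" coordinate (plus the scalar quantile coordinate) remains.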
- expected_y = xr.DataArray([1, 22], [("y", [0, 1])]) + expected_y = xr.DataArray( + data=[1, 22], coords={"y": [0, 1], "quantile": 0}, dims="y" + ) assert_identical(expected_y, actual_y) actual_xx = array.groupby("x").quantile(0) expected_xx = xr.DataArray( - [[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])] + data=[[1, 11, 22], [4, 15, 24]], + coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0}, + dims=("x", "y"), ) assert_identical(expected_xx, actual_xx) actual_yy = array.groupby("y").quantile(0) expected_yy = xr.DataArray( - [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]], - [("x", [1, 1, 1, 2, 2]), ("y", [0, 1])], + data=[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]], + coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0}, + dims=("x", "y"), ) assert_identical(expected_yy, actual_yy) @@ -180,14 +196,14 @@ def test_da_groupby_quantile(): x = [0, 1] foo = xr.DataArray( np.reshape(np.arange(365 * 2), (365, 2)), - coords=dict(time=times, x=x), + coords={"time": times, "x": x}, dims=("time", "x"), ) g = foo.groupby(foo.time.dt.month) actual = g.quantile(0, dim=...) expected = xr.DataArray( - [ + data=[ 0.0, 62.0, 120.0, @@ -201,12 +217,111 @@ def test_da_groupby_quantile(): 610.0, 670.0, ], - [("month", np.arange(1, 13))], + coords={"month": np.arange(1, 13), "quantile": 0}, + dims="month", ) assert_identical(expected, actual) actual = g.quantile(0, dim="time")[:2] - expected = xr.DataArray([[0.0, 1], [62.0, 63]], [("month", [1, 2]), ("x", [0, 1])]) + expected = xr.DataArray( + data=[[0.0, 1], [62.0, 63]], + coords={"month": [1, 2], "x": [0, 1], "quantile": 0}, + dims=("month", "x"), + ) + assert_identical(expected, actual) + + +def test_ds_groupby_quantile(): + ds = xr.Dataset( + data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])}, coords={"x": [1, 1, 1, 2, 2, 2]} + ) + + # Scalar quantile + expected = xr.Dataset( + data_vars={"a": ("x", [2, 5])}, coords={"quantile": 0.5, "x": [1, 2]} + ) + actual = ds.groupby("x").quantile(0.5) + assert_identical(expected, actual) + + # Vector quantile + expected = xr.Dataset( + data_vars={"a": (("x", "quantile"), [[1, 3], [4, 6]])}, + coords={"x": [1, 2], "quantile": [0, 1]}, + ) + actual = ds.groupby("x").quantile([0, 1]) + assert_identical(expected, actual) + + # Multiple dimensions + ds = xr.Dataset( + data_vars={ + "a": ( + ("x", "y"), + [[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]], + ) + }, + coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]}, + ) + + actual_x = ds.groupby("x").quantile(0, dim=...) + expected_x = xr.Dataset({"a": ("x", [1, 4])}, coords={"x": [1, 2], "quantile": 0}) + assert_identical(expected_x, actual_x) + + actual_y = ds.groupby("y").quantile(0, dim=...) 
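+    # The scalar quantile coordinate is now kept on the result: GroupBy.quantile
+    # no longer calls drop_vars("quantile") for scalar q.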
+ expected_y = xr.Dataset({"a": ("y", [1, 22])}, coords={"y": [0, 1], "quantile": 0}) + assert_identical(expected_y, actual_y) + + actual_xx = ds.groupby("x").quantile(0) + expected_xx = xr.Dataset( + {"a": (("x", "y"), [[1, 11, 22], [4, 15, 24]])}, + coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0}, + ) + assert_identical(expected_xx, actual_xx) + + actual_yy = ds.groupby("y").quantile(0) + expected_yy = xr.Dataset( + {"a": (("x", "y"), [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]])}, + coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0}, + ).transpose() + assert_identical(expected_yy, actual_yy) + + times = pd.date_range("2000-01-01", periods=365) + x = [0, 1] + foo = xr.Dataset( + {"a": (("time", "x"), np.reshape(np.arange(365 * 2), (365, 2)))}, + coords=dict(time=times, x=x), + ) + g = foo.groupby(foo.time.dt.month) + + actual = g.quantile(0, dim=...) + expected = xr.Dataset( + { + "a": ( + "month", + [ + 0.0, + 62.0, + 120.0, + 182.0, + 242.0, + 304.0, + 364.0, + 426.0, + 488.0, + 548.0, + 610.0, + 670.0, + ], + ) + }, + coords={"month": np.arange(1, 13), "quantile": 0}, + ) + assert_identical(expected, actual) + + actual = g.quantile(0, dim="time").isel(month=slice(None, 2)) + expected = xr.Dataset( + data_vars={"a": (("month", "x"), [[0.0, 1], [62.0, 63]])}, + coords={"month": [1, 2], "x": [0, 1], "quantile": 0}, + ) assert_identical(expected, actual) From 56c16e4bf45a3771fd9acba76d802c0199c14519 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 16 Nov 2019 23:36:43 +0900 Subject: [PATCH 15/24] Added fill_value for unstack (#3541) * Added fill_value for unstack * remove sparse option and fix unintended changes * a bug fix * using assert_equal * assert_equals -> assert_equal --- doc/whats-new.rst | 3 +++ xarray/core/dataarray.py | 7 +++++-- xarray/core/dataset.py | 13 +++++++++---- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c835fbeff45..6bf495713fe 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,6 +39,9 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added the ``fill_value`` option to :py:meth:`~xarray.DataArray.unstack` and + :py:meth:`~xarray.Dataset.unstack` (:issue:`3518`). + By `Keisuke Fujii `_. - Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data gap that will be filled by interpolation. By `Deepak Cherian `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b27a61d530b..23342fc5e0d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1726,7 +1726,9 @@ def stack( return self._from_temp_dataset(ds) def unstack( - self, dim: Union[Hashable, Sequence[Hashable], None] = None + self, + dim: Union[Hashable, Sequence[Hashable], None] = None, + fill_value: Any = dtypes.NA, ) -> "DataArray": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -1739,6 +1741,7 @@ def unstack( dim : hashable or sequence of hashable, optional Dimension(s) over which to unstack. By default unstacks all MultiIndexes. + fill_value: value to be filled. 
By default, np.nan Returns ------- @@ -1770,7 +1773,7 @@ def unstack( -------- DataArray.stack """ - ds = self._to_temp_dataset().unstack(dim) + ds = self._to_temp_dataset().unstack(dim, fill_value) return self._from_temp_dataset(ds) def to_unstacked_dataset(self, dim, level=0): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3a83b477681..371e0d6bf26 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3333,7 +3333,7 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": index = self.get_index(dim) index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3342,7 +3342,7 @@ def _unstack_once(self, dim: Hashable) -> "Dataset": if index.equals(full_idx): obj = self else: - obj = self.reindex({dim: full_idx}, copy=False) + obj = self.reindex({dim: full_idx}, copy=False, fill_value=fill_value) new_dim_names = index.names new_dim_sizes = [lev.size for lev in index.levels] @@ -3368,7 +3368,11 @@ def _unstack_once(self, dim: Hashable) -> "Dataset": variables, coord_names=coord_names, indexes=indexes ) - def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": + def unstack( + self, + dim: Union[Hashable, Iterable[Hashable]] = None, + fill_value: Any = dtypes.NA, + ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into multiple new dimensions. @@ -3380,6 +3384,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": dim : Hashable or iterable of Hashable, optional Dimension(s) over which to unstack. By default unstacks all MultiIndexes. + fill_value: value to be filled. By default, np.nan Returns ------- @@ -3417,7 +3422,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim) + result = result._unstack_once(dim, fill_value) return result def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 780843f2e61..be40ce7c6e8 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2794,6 +2794,23 @@ def test_unstack_errors(self): with raises_regex(ValueError, "do not have a MultiIndex"): ds.unstack("x") + def test_unstack_fill_value(self): + ds = xr.Dataset( + {"var": (("x",), np.arange(6))}, + coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, + ) + # make ds incomplete + ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) + # test fill_value + actual = ds.unstack("index", fill_value=-1) + expected = ds.unstack("index").fillna(-1).astype(np.int) + assert actual["var"].dtype == np.int + assert_equal(actual, expected) + + actual = ds["var"].unstack("index", fill_value=-1) + expected = ds["var"].unstack("index").fillna(-1).astype(np.int) + assert actual.equals(expected) + def test_stack_unstack_fast(self): ds = Dataset( { From 9755e3f3e986c3ab89797ce86201b64b7f702184 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 16 Nov 2019 15:36:49 -0500 Subject: [PATCH 16/24] small simplification of rename from #3532 (#3539) --- xarray/core/dataset.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 371e0d6bf26..5de254614ff 100644 --- 
a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2657,13 +2657,7 @@ def _rename_indexes(self, name_dict, dims_set): continue if isinstance(v, pd.MultiIndex): new_names = [name_dict.get(k, k) for k in v.names] - index = pd.MultiIndex( - v.levels, - v.labels, - v.sortorder, - names=new_names, - verify_integrity=False, - ) + index = v.rename(names=new_names) else: index = v.rename(new_name) indexes[new_name] = index From 980a1d26969b603d4be61033791781abd702d02a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 16 Nov 2019 13:39:33 -0700 Subject: [PATCH 17/24] tweak whats-new. (#3540) * tweak whats-new. * update. --- doc/conf.py | 1 + doc/whats-new.rst | 96 ++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 47 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 0e04f8ccde8..f1199d53fb7 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -346,4 +346,5 @@ "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "numba": ("https://numba.pydata.org/numba-doc/latest", None), "matplotlib": ("https://matplotlib.org", None), + "dask": ("https://docs.dask.org/en/latest", None), } diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6bf495713fe..cb274bcaee8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,33 +21,35 @@ v0.14.1 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ -- Broken compatibility with cftime < 1.0.3. - By `Deepak Cherian `_. +- Broken compatibility with ``cftime < 1.0.3`` . By `Deepak Cherian `_. - .. note:: + .. warning:: cftime version 1.0.4 is broken (`cftime/126 `_); please use version 1.0.4.2 instead. -- All leftover support for dates from non-standard calendars through netcdftime, the +- All leftover support for dates from non-standard calendars through ``netcdftime``, the module included in versions of netCDF4 prior to 1.4 that eventually became the - cftime package, has been removed in favor of relying solely on the standalone - cftime package (:pull:`3450`). + `cftime `_ package, has been removed in favor of relying solely on + the standalone ``cftime`` package (:pull:`3450`). By `Spencer Clark `_. New Features ~~~~~~~~~~~~ -- Added the ``fill_value`` option to :py:meth:`~xarray.DataArray.unstack` and - :py:meth:`~xarray.Dataset.unstack` (:issue:`3518`). +- Added the ``max_gap`` kwarg to :py:meth:`DataArray.interpolate_na` and + :py:meth:`Dataset.interpolate_na`. This controls the maximum size of the data +- Added the ``fill_value`` option to :py:meth:`DataArray.unstack` and + :py:meth:`Dataset.unstack` (:issue:`3518`, :pull:`3541`). By `Keisuke Fujii `_. - Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data gap that will be filled by interpolation. By `Deepak Cherian `_. - :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for - dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible + dropping variables (including coordinates). The existing :py:meth:`Dataset.drop` & + :py:meth:`DataArray.drop` methods remain as a backward compatible option for dropping either labels or variables, but using the more specific methods is encouraged. (:pull:`3475`) By `Maximilian Roos `_ @@ -58,71 +60,71 @@ New Features methods is encouraged. 
(:pull:`3459`) By `Maximilian Roos `_ -- :py:meth:`Dataset.transpose` and :py:meth:`DataArray.transpose` now support an ellipsis (`...`) +- :py:meth:`Dataset.transpose` and :py:meth:`DataArray.transpose` now support an ellipsis (``...``) to represent all 'other' dimensions. For example, to move one dimension to the front, - use `.transpose('x', ...)`. (:pull:`3421`) + use ``.transpose('x', ...)``. (:pull:`3421`) By `Maximilian Roos `_ -- Changed `xr.ALL_DIMS` to equal python's `Ellipsis` (`...`), and changed internal usages to use - `...` directly. As before, you can use this to instruct a `groupby` operation - to reduce over all dimensions. While we have no plans to remove `xr.ALL_DIMS`, we suggest - using `...`. (:pull:`3418`) +- Changed ``xr.ALL_DIMS`` to equal python's ``Ellipsis`` (``...``), and changed internal usages to use + ``...`` directly. As before, you can use this to instruct a ``groupby`` operation + to reduce over all dimensions. While we have no plans to remove ``xr.ALL_DIMS``, we suggest + using ``...``. (:pull:`3418`) By `Maximilian Roos `_ -- :py:func:`~xarray.dot`, and :py:func:`~xarray.DataArray.dot` now support the - `dims=...` option to sum over the union of dimensions of all input arrays +- :py:func:`xarray.dot`, and :py:meth:`DataArray.dot` now support the + ``dims=...`` option to sum over the union of dimensions of all input arrays (:issue:`3423`) by `Mathias Hauser `_. - Added new :py:meth:`Dataset._repr_html_` and :py:meth:`DataArray._repr_html_` to improve - representation of objects in jupyter. By default this feature is turned off - for now. Enable it with :py:meth:`xarray.set_options(display_style="html")`. + representation of objects in Jupyter. By default this feature is turned off + for now. Enable it with ``xarray.set_options(display_style="html")``. (:pull:`3425`) by `Benoit Bovy `_ and `Julia Signell `_. - Implement `dask deterministic hashing `_ for xarray objects. Note that xarray objects with a dask.array backend already used deterministic hashing in previous releases; this change implements it when whole - xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is + xarray objects are embedded in a dask graph, e.g. when :py:meth:`DataArray.map` is invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`) By `Deepak Cherian `_ and `Guido Imperiale `_. -- Add the documented-but-missing :py:meth:`xarray.core.groupby.DatasetGroupBy.quantile`. +- Add the documented-but-missing :py:meth:`DatasetGroupBy.quantile`. (:issue:`3525`, :pull:`3527`). By `Justus Magin `_. Bug fixes ~~~~~~~~~ - Ensure an index of type ``CFTimeIndex`` is not converted to a ``DatetimeIndex`` when - calling :py:meth:`Dataset.rename` (also :py:meth:`Dataset.rename_dims` - and :py:meth:`xr.Dataset.rename_vars`). By `Mathias Hauser `_ - (:issue:`3522`). -- Fix a bug in `set_index` in case that an existing dimension becomes a level variable of MultiIndex. (:pull:`3520`) - By `Keisuke Fujii `_. -- Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) + calling :py:meth:`Dataset.rename`, :py:meth:`Dataset.rename_dims` and :py:meth:`Dataset.rename_vars`. + By `Mathias Hauser `_. (:issue:`3522`). +- Fix a bug in :py:meth:`DataArray.set_index` in case that an existing dimension becomes a level + variable of MultiIndex. (:pull:`3520`). By `Keisuke Fujii `_. +- Harmonize ``_FillValue``, ``missing_value`` during encoding and decoding steps. (:pull:`3502`) By `Anderson Banihirwe `_. 
- Fix regression introduced in v0.14.0 that would cause a crash if dask is installed but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle `_ - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`). By `Deepak Cherian `_. -- Use dask names to compare dask objects prior to comparing values after computation. +- Make alignment and concatenation significantly more efficient by using dask names to compare dask + objects prior to comparing values after computation. This change makes it more convenient to carry + around large non-dimensional coordinate variables backed by dask arrays. Existing workarounds involving + ``reset_coords(drop=True)`` should now be unnecessary in most cases. (:issue:`3068`, :issue:`3311`, :issue:`3454`, :pull:`3453`). By `Deepak Cherian `_. -- Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4. - By `Anderson Banihirwe `_. +- Add support for cftime>=1.0.4. By `Anderson Banihirwe `_. - Rolling reduction operations no longer compute dask arrays by default. (:issue:`3161`). In addition, the ``allow_lazy`` kwarg to ``reduce`` is deprecated. By `Deepak Cherian `_. -- Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and - :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple dimensions. +- Fix :py:meth:`GroupBy.reduce` when reducing over multiple dimensions. (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. (:issue:`3480`). By `Akihiro Matsukawa `_. Documentation ~~~~~~~~~~~~~ -- Fix leap year condition in example (http://xarray.pydata.org/en/stable/examples/monthly-means.html) - by `Mickaël Lalande `_. +- Fix leap year condition in `monthly means example `_. + By `Mickaël Lalande `_. - Fix the documentation of :py:meth:`DataArray.resample` and - :py:meth:`Dataset.resample` and explicitly state that a + :py:meth:`Dataset.resample` — explicitly state that a datetime-like dimension is required. (:pull:`3400`) By `Justus Magin `_. -- Update the terminology page to address multidimensional coordinates. (:pull:`3410`) +- Update the :ref:`terminology` page to address multidimensional coordinates. (:pull:`3410`) By `Jon Thielen `_. - Fix the documentation of :py:meth:`Dataset.integrate` and :py:meth:`DataArray.integrate` and add an example to @@ -186,15 +188,15 @@ Breaking changes (:issue:`3222`, :issue:`3293`, :issue:`3340`, :issue:`3346`, :issue:`3358`). By `Guido Imperiale `_. -- Dropped the `drop=False` optional parameter from :meth:`Variable.isel`. +- Dropped the ``drop=False`` optional parameter from :py:meth:`Variable.isel`. It was unused and doesn't make sense for a Variable. (:pull:`3375`). By `Guido Imperiale `_. -- Remove internal usage of `collections.OrderedDict`. After dropping support for - Python <=3.5, most uses of `OrderedDict` in Xarray were no longer necessary. We - have removed the internal use of the `OrderedDict` in favor of Python's builtin - `dict` object which is now ordered itself. This change will be most obvious when - interacting with the `attrs` property on the Dataset and DataArray objects. +- Remove internal usage of :py:class:`collections.OrderedDict`. After dropping support for + Python <=3.5, most uses of ``OrderedDict`` in Xarray were no longer necessary. We + have removed the internal use of the ``OrderedDict`` in favor of Python's builtin + ``dict`` object which is now ordered itself. This change will be most obvious when + interacting with the ``attrs`` property on Dataset and DataArray objects. (:issue:`3380`, :pull:`3389`). 
By `Joe Hamman `_. New functions/methods @@ -220,15 +222,15 @@ Enhancements - Added a ``GroupBy.dims`` property that mirrors the dimensions of each group (:issue:`3344`). -- Speed up :meth:`Dataset.isel` up to 33% and :meth:`DataArray.isel` up to 25% for small +- Speed up :py:meth:`Dataset.isel` up to 33% and :py:meth:`DataArray.isel` up to 25% for small arrays (:issue:`2799`, :pull:`3375`). By `Guido Imperiale `_. Bug fixes ~~~~~~~~~ - Reintroduce support for :mod:`weakref` (broken in v0.13.0). Support has been - reinstated for :class:`DataArray` and :class:`Dataset` objects only. Internal xarray - objects remain unaddressable by weakref in order to save memory + reinstated for :py:class:`~xarray.DataArray` and :py:class:`~xarray.Dataset` objects only. + Internal xarray objects remain unaddressable by weakref in order to save memory (:issue:`3317`). By `Guido Imperiale `_. - Line plots with the ``x`` or ``y`` argument set to a 1D non-dimensional coord now plot the correct data for 2D DataArrays @@ -238,7 +240,7 @@ Bug fixes - The default behaviour of reducing across all dimensions for :py:class:`~xarray.core.groupby.DataArrayGroupBy` objects has now been properly removed as was done for :py:class:`~xarray.core.groupby.DatasetGroupBy` in 0.13.0 (:issue:`3337`). - Use `xarray.ALL_DIMS` if you need to replicate previous behaviour. + Use ``xarray.ALL_DIMS`` if you need to replicate previous behaviour. Also raise nicer error message when no groups are created (:issue:`1764`). By `Deepak Cherian `_. - Fix error in concatenating unlabeled dimensions (:pull:`3362`). @@ -325,7 +327,7 @@ New functions/methods - xarray can now wrap around any `NEP18 `_ compliant - numpy-like library (important: read notes about NUMPY_EXPERIMENTAL_ARRAY_FUNCTION in + numpy-like library (important: read notes about ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION`` in the above link). Added explicit test coverage for `sparse `_. (:issue:`3117`, :issue:`3202`). This requires `sparse>=0.8.0`. 
   By `Nezar Abdennur `_

From 45fd0e63f43cf313b022a33aeec7f0f982e1908b Mon Sep 17 00:00:00 2001
From: crusaderky 
Date: Tue, 19 Nov 2019 14:06:45 +0000
Subject: [PATCH 18/24] Numpy 1.18 support (#3537)

* Closes #3409
* Unpin versions
* Rewrite unit test for clarity about its real scope
* mean() on dask
* Trivial
* duck_array_ops should never receive xarray.Variable

---
 ci/azure/install.yml                |  2 +-
 ci/requirements/py36.yml            |  2 +-
 ci/requirements/py37.yml            |  2 +-
 doc/whats-new.rst                   |  7 +++-
 xarray/core/dataset.py              |  4 ++-
 xarray/core/duck_array_ops.py       | 28 ++++++++++++++--
 xarray/tests/test_dataset.py        |  4 ++-
 xarray/tests/test_duck_array_ops.py | 50 +++++++++++++++++++----------
 8 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/ci/azure/install.yml b/ci/azure/install.yml
index fee886ba804..baa69bcc8d5 100644
--- a/ci/azure/install.yml
+++ b/ci/azure/install.yml
@@ -16,9 +16,9 @@ steps:
       --pre \
       --upgrade \
       matplotlib \
+      numpy \
      pandas \
       scipy
-    # numpy \  # FIXME https://github.com/pydata/xarray/issues/3409
     pip install \
       --no-deps \
       --upgrade \
diff --git a/ci/requirements/py36.yml b/ci/requirements/py36.yml
index 10fe69253e8..820160b19cc 100644
--- a/ci/requirements/py36.yml
+++ b/ci/requirements/py36.yml
@@ -25,7 +25,7 @@ dependencies:
   - nc-time-axis
   - netcdf4
   - numba
-  - numpy<1.18  # FIXME https://github.com/pydata/xarray/issues/3409
+  - numpy
   - pandas
   - pint
   - pip
diff --git a/ci/requirements/py37.yml b/ci/requirements/py37.yml
index 827c664a222..4a7aaf7d32b 100644
--- a/ci/requirements/py37.yml
+++ b/ci/requirements/py37.yml
@@ -25,7 +25,7 @@ dependencies:
   - nc-time-axis
   - netcdf4
   - numba
-  - numpy<1.18  # FIXME https://github.com/pydata/xarray/issues/3409
+  - numpy
   - pandas
   - pint
   - pip
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index cb274bcaee8..0c929b5b711 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -115,6 +115,12 @@ Bug fixes
   (:issue:`3402`). By `Deepak Cherian `_
 - Allow appending datetime and bool data variables to zarr stores.
   (:issue:`3480`). By `Akihiro Matsukawa `_.
+- Add support for numpy >=1.18; fix ``mean()`` on datetime64 arrays on the dask backend
+  (:issue:`3409`, :pull:`3537`). By `Guido Imperiale `_.
+- Add support for pandas >=0.26 (:issue:`3440`).
+  By `Deepak Cherian `_.
+- Add support for pseudonetcdf >=3.1 (:pull:`3485`).
+  By `Barron Henderson `_.

 Documentation
 ~~~~~~~~~~~~~
@@ -133,7 +139,6 @@ Documentation

 Internal Changes
 ~~~~~~~~~~~~~~~~
-
 - Added integration tests against `pint `_.
   (:pull:`3238`, :pull:`3447`, :pull:`3493`, :pull:`3508`)
   by `Justus Magin `_.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 5de254614ff..c631a4c11ea 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5316,7 +5316,9 @@ def _integrate_one(self, coord, datetime_unit=None):
             datetime_unit, _ = np.datetime_data(coord_var.dtype)
         elif datetime_unit is None:
             datetime_unit = "s"  # Default to seconds for cftime objects
-        coord_var = datetime_to_numeric(coord_var, datetime_unit=datetime_unit)
+        coord_var = coord_var._replace(
+            data=datetime_to_numeric(coord_var.data, datetime_unit=datetime_unit)
+        )

         variables = {}
         coord_names = set()
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
index 71e79335c3d..cf616acb485 100644
--- a/xarray/core/duck_array_ops.py
+++ b/xarray/core/duck_array_ops.py
@@ -351,6 +351,26 @@ def f(values, axis=None, skipna=None, **kwargs):
 _mean = _create_nan_agg_method("mean")


+def _datetime_nanmin(array):
+    """nanmin() function for datetime64.
+
+    Caveats that this function deals with:
+
+    - In numpy < 1.18, min() on datetime64 incorrectly ignores NaT
+    - numpy nanmin() doesn't work on datetime64 (all versions at the moment of writing)
+    - dask min() does not work on datetime64 (all versions at the moment of writing)
+    """
+    assert array.dtype.kind in "mM"
+    dtype = array.dtype
+    # (NaT).astype(float) does not produce NaN...
+    array = where(pandas_isnull(array), np.nan, array.astype(float))
+    array = min(array, skipna=True)
+    if isinstance(array, float):
+        array = np.array(array)
+    # ...but (NaN).astype("M8") does produce NaT
+    return array.astype(dtype)
+
+
 def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
     """Convert an array containing datetime-like data to an array of floats.

@@ -370,7 +390,10 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
     """
     # TODO: make this function dask-compatible?
     if offset is None:
-        offset = array.min()
+        if array.dtype.kind in "Mm":
+            offset = _datetime_nanmin(array)
+        else:
+            offset = min(array)
     array = array - offset

     if not hasattr(array, "dtype"):  # scalar is converted to 0d-array
@@ -401,7 +424,8 @@ def mean(array, axis=None, skipna=None, **kwargs):

     array = asarray(array)
     if array.dtype.kind in "Mm":
-        offset = min(array)
+        offset = _datetime_nanmin(array)
+
+        # xarray always uses np.datetime64[ns] for np.datetime64 data
         dtype = "timedelta64[ns]"
         return (
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index be40ce7c6e8..de074da541f 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -5874,7 +5874,9 @@ def test_trapz_datetime(dask, which_datetime):

     actual = da.integrate("time", datetime_unit="D")
     expected_data = np.trapz(
-        da, duck_array_ops.datetime_to_numeric(da["time"], datetime_unit="D"), axis=0
+        da.data,
+        duck_array_ops.datetime_to_numeric(da["time"].data, datetime_unit="D"),
+        axis=0,
     )
     expected = xr.DataArray(
         expected_data,
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index f678af2fec5..aee7bbd6b11 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -274,23 +274,39 @@ def assert_dask_array(da, dask):


 @arm_xfail
-@pytest.mark.parametrize("dask", [False, True])
-def test_datetime_reduce(dask):
-    time = np.array(pd.date_range("15/12/1999", periods=11))
-    time[8:11] = np.nan
-    da = DataArray(np.linspace(0, 365, num=11), dims="time", coords={"time": time})
-
-    if dask and has_dask:
-        chunks = {"time": 5}
-        da = da.chunk(chunks)
-
-    actual = da["time"].mean()
-    assert not pd.isnull(actual)
-    actual = da["time"].mean(skipna=False)
-    assert pd.isnull(actual)
-
-    # test for a 0d array
-    assert da["time"][0].mean() == da["time"][:1].mean()
+@pytest.mark.parametrize("dask", [False, True] if has_dask else [False])
+def test_datetime_mean(dask):
+    da = DataArray(
+        np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8"),
+        dims=["time"],
+    )
+    if dask:
+        # Trigger use case where a chunk is full of NaT
+        da = da.chunk({"time": 3})
+
+    expect = DataArray(np.array("2010-01-02", dtype="M8"))
+    expect_nat = DataArray(np.array("NaT", dtype="M8"))
+
+    actual = da.mean()
+    if dask:
+        assert actual.chunks is not None
+    assert_equal(actual, expect)
+
+    actual = da.mean(skipna=False)
+    if dask:
+        assert actual.chunks is not None
+    assert_equal(actual, expect_nat)
+
+    # tests for 1d array full of NaT
+    assert_equal(da[[1]].mean(), expect_nat)
+
assert_equal(da[[1]].mean(skipna=False), expect_nat) + + # tests for a 0d array + assert_equal(da[0].mean(), da[0]) + assert_equal(da[0].mean(skipna=False), da[0]) + assert_equal(da[1].mean(), expect_nat) + assert_equal(da[1].mean(skipna=False), expect_nat) @requires_cftime From dc559ea4a0b043908b5539641c2d22ab9a051b19 Mon Sep 17 00:00:00 2001 From: keewis Date: Tue, 19 Nov 2019 16:32:25 +0100 Subject: [PATCH 19/24] Silence sphinx warnings (#3516) * silence sphinx warnings * silence more sphinx warnings * fix some references * fix the docstrings of Dataset reduce methods * mark the orphaned files as such * silence some nit-picky warnings * convert all references to xray to double backtick quoted text * silence more warnings in whats-new.rst * require a whatsnew format of Name * rename the second cf conventions link * silence more sphinx warnings * get interpolate_na docstrings in sync with master * fix sphinx warnings for interpolate_na docstrings * update references to old documentation sections * cut the link to h5netcdf.File * use the correct reference types for numpy * update the reference to atop (dask renamed it to blockwise) * rewrite numpy docstrings * guard against non-str documentation * pass name to skip_signature * remove links to pandas.Panel * convince sphinx to create pages astype and groupby().quantile * more warnings --- doc/README.rst | 2 + doc/api-hidden.rst | 5 + doc/combining.rst | 6 +- doc/computation.rst | 6 +- doc/dask.rst | 2 +- doc/data-structures.rst | 6 +- doc/pandas.rst | 2 +- doc/whats-new.rst | 240 +++++++++++++++++------------------ xarray/backends/api.py | 8 +- xarray/coding/cftimeindex.py | 2 +- xarray/core/alignment.py | 2 +- xarray/core/combine.py | 2 + xarray/core/common.py | 26 ++-- xarray/core/computation.py | 2 +- xarray/core/concat.py | 1 + xarray/core/dataarray.py | 12 +- xarray/core/dataset.py | 15 ++- xarray/core/groupby.py | 7 +- xarray/plot/plot.py | 2 +- xarray/ufuncs.py | 40 ++++++ 20 files changed, 229 insertions(+), 159 deletions(-) diff --git a/doc/README.rst b/doc/README.rst index af7bc96092c..0579f85d85f 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -1,3 +1,5 @@ +:orphan: + xarray ------ diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 8f82b30a442..027c732697f 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -2,6 +2,8 @@ .. This extra page is a work around for sphinx not having any support for .. hiding an autosummary table. +:orphan: + .. currentmodule:: xarray .. autosummary:: @@ -30,9 +32,11 @@ core.groupby.DatasetGroupBy.first core.groupby.DatasetGroupBy.last core.groupby.DatasetGroupBy.fillna + core.groupby.DatasetGroupBy.quantile core.groupby.DatasetGroupBy.where Dataset.argsort + Dataset.astype Dataset.clip Dataset.conj Dataset.conjugate @@ -71,6 +75,7 @@ core.groupby.DataArrayGroupBy.first core.groupby.DataArrayGroupBy.last core.groupby.DataArrayGroupBy.fillna + core.groupby.DataArrayGroupBy.quantile core.groupby.DataArrayGroupBy.where DataArray.argsort diff --git a/doc/combining.rst b/doc/combining.rst index 4593d410d23..05b7f2efc50 100644 --- a/doc/combining.rst +++ b/doc/combining.rst @@ -255,11 +255,11 @@ Combining along multiple dimensions ``combine_nested``. For combining many objects along multiple dimensions xarray provides -:py:func:`~xarray.combine_nested`` and :py:func:`~xarray.combine_by_coords`. These +:py:func:`~xarray.combine_nested` and :py:func:`~xarray.combine_by_coords`. 
These functions use a combination of ``concat`` and ``merge`` across different variables to combine many objects into one. -:py:func:`~xarray.combine_nested`` requires specifying the order in which the +:py:func:`~xarray.combine_nested` requires specifying the order in which the objects should be combined, while :py:func:`~xarray.combine_by_coords` attempts to infer this ordering automatically from the coordinates in the data. @@ -310,4 +310,4 @@ These functions can be used by :py:func:`~xarray.open_mfdataset` to open many files as one dataset. The particular function used is specified by setting the argument ``'combine'`` to ``'by_coords'`` or ``'nested'``. This is useful for situations where your data is split across many files in multiple locations, -which have some known relationship between one another. \ No newline at end of file +which have some known relationship between one another. diff --git a/doc/computation.rst b/doc/computation.rst index 240a1e5704b..1ac30f55ee7 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -325,8 +325,8 @@ Broadcasting by dimension name ``DataArray`` objects are automatically align themselves ("broadcasting" in the numpy parlance) by dimension name instead of axis order. With xarray, you do not need to transpose arrays or insert dimensions of length 1 to get array -operations to work, as commonly done in numpy with :py:func:`np.reshape` or -:py:const:`np.newaxis`. +operations to work, as commonly done in numpy with :py:func:`numpy.reshape` or +:py:data:`numpy.newaxis`. This is best illustrated by a few examples. Consider two one-dimensional arrays with different sizes aligned along different dimensions: @@ -566,7 +566,7 @@ to set ``axis=-1``. As an example, here is how we would wrap Because ``apply_ufunc`` follows a standard convention for ufuncs, it plays nicely with tools for building vectorized functions, like -:func:`numpy.broadcast_arrays` and :func:`numpy.vectorize`. For high performance +:py:func:`numpy.broadcast_arrays` and :py:class:`numpy.vectorize`. For high performance needs, consider using Numba's :doc:`vectorize and guvectorize `. In addition to wrapping functions, ``apply_ufunc`` can automatically parallelize diff --git a/doc/dask.rst b/doc/dask.rst index 11f378aa376..ed99ffaa896 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -285,7 +285,7 @@ automate `embarrassingly parallel `__ "map" type operations where a function written for processing NumPy arrays should be repeatedly applied to xarray objects containing Dask arrays. It works similarly to -:py:func:`dask.array.map_blocks` and :py:func:`dask.array.atop`, but without +:py:func:`dask.array.map_blocks` and :py:func:`dask.array.blockwise`, but without requiring an intermediate layer of abstraction. For the best performance when using Dask's multi-threaded scheduler, wrap a diff --git a/doc/data-structures.rst b/doc/data-structures.rst index 93cdc7e9765..d5c8fa961f7 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -45,7 +45,7 @@ Creating a DataArray The :py:class:`~xarray.DataArray` constructor takes: - ``data``: a multi-dimensional array of values (e.g., a numpy ndarray, - :py:class:`~pandas.Series`, :py:class:`~pandas.DataFrame` or :py:class:`~pandas.Panel`) + :py:class:`~pandas.Series`, :py:class:`~pandas.DataFrame` or ``pandas.Panel``) - ``coords``: a list or dictionary of coordinates. If a list, it should be a list of tuples where the first element is the dimension name and the second element is the corresponding coordinate array_like object. 
@@ -125,7 +125,7 @@ As a dictionary with coords across multiple dimensions: If you create a ``DataArray`` by supplying a pandas :py:class:`~pandas.Series`, :py:class:`~pandas.DataFrame` or -:py:class:`~pandas.Panel`, any non-specified arguments in the +``pandas.Panel``, any non-specified arguments in the ``DataArray`` constructor will be filled in from the pandas object: .. ipython:: python @@ -301,7 +301,7 @@ names, and its data is aligned to any existing dimensions. You can also create an dataset from: -- A :py:class:`pandas.DataFrame` or :py:class:`pandas.Panel` along its columns and items +- A :py:class:`pandas.DataFrame` or ``pandas.Panel`` along its columns and items respectively, by passing it into the :py:class:`~xarray.Dataset` directly - A :py:class:`pandas.DataFrame` with :py:meth:`Dataset.from_dataframe `, which will additionally handle MultiIndexes See :ref:`pandas` diff --git a/doc/pandas.rst b/doc/pandas.rst index 4f3088b4c34..72abf6609f6 100644 --- a/doc/pandas.rst +++ b/doc/pandas.rst @@ -112,7 +112,7 @@ automatically stacking them into a ``MultiIndex``. :py:meth:`DataArray.to_pandas() ` is a shortcut that lets you convert a DataArray directly into a pandas object with the same dimensionality (i.e., a 1D array is converted to a :py:class:`~pandas.Series`, -2D to :py:class:`~pandas.DataFrame` and 3D to :py:class:`~pandas.Panel`): +2D to :py:class:`~pandas.DataFrame` and 3D to ``pandas.Panel``): .. ipython:: python diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0c929b5b711..105d661b5f7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -112,9 +112,9 @@ Bug fixes In addition, the ``allow_lazy`` kwarg to ``reduce`` is deprecated. By `Deepak Cherian `_. - Fix :py:meth:`GroupBy.reduce` when reducing over multiple dimensions. - (:issue:`3402`). By `Deepak Cherian `_ + (:issue:`3402`). By `Deepak Cherian `_ - Allow appending datetime and bool data variables to zarr stores. - (:issue:`3480`). By `Akihiro Matsukawa `_. + (:issue:`3480`). By `Akihiro Matsukawa `_. - Add support for numpy >=1.18 (); bugfix mean() on datetime64 arrays on dask backend (:issue:`3409`, :pull:`3537`). By `Guido Imperiale `_. - Add support for pandas >=0.26 (:issue:`3440`). @@ -239,9 +239,9 @@ Bug fixes (:issue:`3317`). By `Guido Imperiale `_. - Line plots with the ``x`` or ``y`` argument set to a 1D non-dimensional coord now plot the correct data for 2D DataArrays - (:issue:`3334`). By `Tom Nicholas `_. + (:issue:`3334`). By `Tom Nicholas `_. - Make :py:func:`~xarray.concat` more robust when merging variables present in some datasets but - not others (:issue:`508`). By `Deepak Cherian `_. + not others (:issue:`508`). By `Deepak Cherian `_. - The default behaviour of reducing across all dimensions for :py:class:`~xarray.core.groupby.DataArrayGroupBy` objects has now been properly removed as was done for :py:class:`~xarray.core.groupby.DatasetGroupBy` in 0.13.0 (:issue:`3337`). @@ -249,26 +249,26 @@ Bug fixes Also raise nicer error message when no groups are created (:issue:`1764`). By `Deepak Cherian `_. - Fix error in concatenating unlabeled dimensions (:pull:`3362`). - By `Deepak Cherian `_. + By `Deepak Cherian `_. - Warn if the ``dim`` kwarg is passed to rolling operations. This is redundant since a dimension is specified when the :py:class:`DatasetRolling` or :py:class:`DataArrayRolling` object is created. - (:pull:`3362`). By `Deepak Cherian `_. + (:pull:`3362`). By `Deepak Cherian `_. 
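To illustrate the ``xarray.ALL_DIMS`` note above, reducing each group over all of its
dimensions now spells as a literal ``...`` (a hedged sketch; the coordinate names and
values are made up for the example)::

    import numpy as np
    import xarray as xr

    da = xr.DataArray(
        np.arange(6).reshape(2, 3),
        dims=["x", "y"],
        coords={"letter": ("x", ["a", "b"])},
    )

    # Reduce each group over all of its dimensions, as xr.ALL_DIMS used to do
    print(da.groupby("letter").mean(...).values)  # [1. 4.]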
Documentation ~~~~~~~~~~~~~ - Created a glossary of important xarray terms (:issue:`2410`, :pull:`3352`). - By `Gregory Gundersen `_. + By `Gregory Gundersen `_. - Created a "How do I..." section (:ref:`howdoi`) for solutions to common questions. (:pull:`3357`). - By `Deepak Cherian `_. + By `Deepak Cherian `_. - Add examples for :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims` (pull:`3331`, pull:`3331`). By `Justus Magin `_. - Add examples for :py:meth:`align`, :py:meth:`merge`, :py:meth:`combine_by_coords`, :py:meth:`full_like`, :py:meth:`zeros_like`, :py:meth:`ones_like`, :py:meth:`Dataset.pipe`, - :py:meth:`Dataset.assign`, :py:meth:`Dataset.reindex`, :py:meth:`Dataset.fillna` (pull:`3328`). + :py:meth:`Dataset.assign`, :py:meth:`Dataset.reindex`, :py:meth:`Dataset.fillna` (:pull:`3328`). By `Anderson Banihirwe `_. - Fixed documentation to clean up an unwanted file created in ``ipython`` example - (:pull:`3353`). By `Gregory Gundersen `_. + (:pull:`3353`). By `Gregory Gundersen `_. .. _whats-new.0.13.0: @@ -322,7 +322,7 @@ Breaking changes - :py:meth:`DataArray.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous positional arguments were deprecated) - Reindexing with variables of a different dimension now raise an error (previously deprecated) -- :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of +- ``xarray.broadcast_array`` is removed (previously deprecated in favor of :py:func:`~xarray.broadcast`) - :py:meth:`Variable.expand_dims` is removed (previously deprecated in favor of :py:meth:`Variable.set_dims`) @@ -358,7 +358,7 @@ New functions/methods - Added :py:meth:`DataArray.broadcast_like` and :py:meth:`Dataset.broadcast_like`. By `Deepak Cherian `_ and `David Mertz - `_. + `_. - Dataset plotting API for visualizing dependencies between two DataArrays! Currently only :py:meth:`Dataset.plot.scatter` is implemented. @@ -404,21 +404,21 @@ Enhancements By `Gerardo Rivera `_. - :py:func:`~xarray.Dataset.to_netcdf()` now supports the ``invalid_netcdf`` kwarg when used - with ``engine="h5netcdf"``. It is passed to :py:func:`h5netcdf.File`. + with ``engine="h5netcdf"``. It is passed to ``h5netcdf.File``. By `Ulrich Herter `_. -- :py:meth:`~xarray.Dataset.drop` now supports keyword arguments; dropping index +- ``xarray.Dataset.drop`` now supports keyword arguments; dropping index labels by using both ``dim`` and ``labels`` or using a :py:class:`~xarray.core.coordinates.DataArrayCoordinates` object are deprecated (:issue:`2910`). - By `Gregory Gundersen `_. + By `Gregory Gundersen `_. - Added examples of :py:meth:`Dataset.set_index` and :py:meth:`DataArray.set_index`, as well are more specific error messages when the user passes invalid arguments (:issue:`3176`). By `Gregory Gundersen `_. -- :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. +- :py:meth:`Dataset.filter_by_attrs` now filters the coordinates as well as the variables. By `Spencer Jones `_. Bug fixes @@ -445,7 +445,7 @@ Bug fixes By `Hasan Ahmad `_. - Fixed bug in ``combine_by_coords()`` causing a `ValueError` if the input had an unused dimension with coordinates which were not monotonic (:issue:`3150`). - By `Tom Nicholas `_. + By `Tom Nicholas `_. - Fixed crash when applying ``distributed.Client.compute()`` to a DataArray (:issue:`3171`). By `Guido Imperiale `_. - Better error message when using groupby on an empty DataArray (:issue:`3037`). 
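As a rough illustration of the ``combine_by_coords`` behaviour referenced in the entries
above (the datasets here are invented; the point is that ordering is inferred from the
coordinate values, not from the argument order)::

    import xarray as xr

    x1 = xr.Dataset({"t": ("x", [10.0, 11.0])}, coords={"x": [0, 1]})
    x2 = xr.Dataset({"t": ("x", [12.0, 13.0])}, coords={"x": [2, 3]})

    # Order is inferred from the coordinate values, not the argument order
    combined = xr.combine_by_coords([x2, x1])
    print(combined.x.values)  # [0 1 2 3]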
@@ -469,7 +469,7 @@ Documentation - Fixed documentation to clean up unwanted files created in ``ipython`` examples (:issue:`3227`). - By `Gregory Gundersen `_. + By `Gregory Gundersen `_. .. _whats-new.0.12.3: @@ -539,7 +539,7 @@ New functions/methods To avoid FutureWarnings switch to using ``combine_nested`` or ``combine_by_coords``, (or set the ``combine`` argument in ``open_mfdataset``). (:issue:`2159`) - By `Tom Nicholas `_. + By `Tom Nicholas `_. - :py:meth:`~xarray.DataArray.rolling_exp` and :py:meth:`~xarray.Dataset.rolling_exp` added, similar to pandas' @@ -585,12 +585,12 @@ Enhancements to existing functionality :py:meth:`DataArray.groupby_bins`, and :py:meth:`DataArray.resample` now accept a keyword argument ``restore_coord_dims`` which keeps the order of the dimensions of multi-dimensional coordinates intact (:issue:`1856`). - By `Peter Hausamann `_. + By `Peter Hausamann `_. - Clean up Python 2 compatibility in code (:issue:`2950`) By `Guido Imperiale `_. - Better warning message when supplying invalid objects to ``xr.merge`` (:issue:`2948`). By `Mathias Hauser `_. -- Add ``errors`` keyword argument to :py:meth:`Dataset.drop` and :py:meth:`Dataset.drop_dims` +- Add ``errors`` keyword argument to ``Dataset.drop`` and :py:meth:`Dataset.drop_dims` that allows ignoring errors if a passed label or dimension is not in the dataset (:issue:`2994`). By `Andrew Ross `_. @@ -786,7 +786,7 @@ Bug fixes `Spencer Clark `_. - Line plots with the ``x`` argument set to a non-dimensional coord now plot the correct data for 1D DataArrays. - (:issue:`2725`). By `Tom Nicholas `_. + (:issue:`2725`). By `Tom Nicholas `_. - Subtracting a scalar ``cftime.datetime`` object from a :py:class:`CFTimeIndex` now results in a :py:class:`pandas.TimedeltaIndex` instead of raising a ``TypeError`` (:issue:`2671`). By `Spencer Clark @@ -802,14 +802,14 @@ Bug fixes By `Yohai Bar-Sinai `_. - Fixed error when trying to reduce a DataArray using a function which does not require an axis argument. (:issue:`2768`) - By `Tom Nicholas `_. + By `Tom Nicholas `_. - Concatenating a sequence of :py:class:`~xarray.DataArray` with varying names sets the name of the output array to ``None``, instead of the name of the first input array. If the names are the same it sets the name to that, instead to the name of the first DataArray in the list as it did before. - (:issue:`2775`). By `Tom Nicholas `_. + (:issue:`2775`). By `Tom Nicholas `_. -- Per `CF conventions +- Per the `CF conventions section on calendars `_, specifying ``'standard'`` as the calendar type in :py:meth:`~xarray.cftime_range` now correctly refers to the ``'gregorian'`` @@ -827,7 +827,7 @@ Bug fixes (e.g. '2000-01-01T00:00:00-05:00') no longer raises an error (:issue:`2649`). By `Spencer Clark `_. - Fixed performance regression with ``open_mfdataset`` (:issue:`2662`). - By `Tom Nicholas `_. + By `Tom Nicholas `_. - Fixed supplying an explicit dimension in the ``concat_dim`` argument to to ``open_mfdataset`` (:issue:`2647`). By `Ben Root `_. @@ -892,13 +892,13 @@ Enhancements but were not explicitly closed. This is mostly useful for debugging; we recommend enabling it in your test suites if you use xarray for IO. By `Stephan Hoyer `_ -- Support Dask ``HighLevelGraphs`` by `Matthew Rocklin `_. +- Support Dask ``HighLevelGraphs`` by `Matthew Rocklin `_. - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now supports the ``loffset`` kwarg just like Pandas. 
By `Deepak Cherian `_ - Datasets are now guaranteed to have a ``'source'`` encoding, so the source file name is always stored (:issue:`2550`). - By `Tom Nicholas `_. + By `Tom Nicholas `_. - The ``apply`` methods for ``DatasetGroupBy``, ``DataArrayGroupBy``, ``DatasetResample`` and ``DataArrayResample`` now support passing positional arguments to the applied function as a tuple to the ``args`` argument. @@ -1020,7 +1020,7 @@ Enhancements dataset and dataarray attrs upon operations. The option is set with ``xarray.set_options(keep_attrs=True)``, and the default is to use the old behaviour. - By `Tom Nicholas `_. + By `Tom Nicholas `_. - Added a new backend for the GRIB file format based on ECMWF *cfgrib* python driver and *ecCodes* C-library. (:issue:`2475`) By `Alessandro Amici `_, @@ -1076,7 +1076,7 @@ Bug fixes CFTimeIndex is now allowed (:issue:`2484`). By `Spencer Clark `_. - Avoid use of Dask's deprecated ``get=`` parameter in tests - by `Matthew Rocklin `_. + by `Matthew Rocklin `_. - An ``OverflowError`` is now accurately raised and caught during the encoding process if a reference date is used that is so distant that the dates must be encoded using cftime rather than NumPy (:issue:`2272`). @@ -1122,7 +1122,7 @@ Enhancements (:issue:`2230`) By `Keisuke Fujii `_. -- :py:meth:`plot()` now accepts the kwargs +- :py:func:`~plot.plot()` now accepts the kwargs ``xscale, yscale, xlim, ylim, xticks, yticks`` just like Pandas. Also ``xincrease=False, yincrease=False`` now use matplotlib's axis inverting methods instead of setting limits. By `Deepak Cherian `_. (:issue:`2224`) @@ -1189,7 +1189,7 @@ Bug fixes - Follow up the renamings in dask; from dask.ghost to dask.overlap By `Keisuke Fujii `_. -- Now :py:func:`xr.apply_ufunc` raises a ValueError when the size of +- Now :py:func:`~xarray.apply_ufunc` raises a ValueError when the size of ``input_core_dims`` is inconsistent with the number of arguments. (:issue:`2341`) By `Keisuke Fujii `_. @@ -1272,7 +1272,7 @@ Enhancements - :py:meth:`~xarray.DataArray.interp` and :py:meth:`~xarray.Dataset.interp` methods are newly added. - See :ref:`interpolating values with interp` for the detail. + See :ref:`interp` for the detail. (:issue:`2079`) By `Keisuke Fujii `_. @@ -1389,7 +1389,7 @@ non-standard calendars used in climate modeling. Documentation ~~~~~~~~~~~~~ -- New FAQ entry, :ref:`faq.other_projects`. +- New FAQ entry, :ref:`related-projects`. By `Deepak Cherian `_. - :ref:`assigning_values` now includes examples on how to select and assign values to a :py:class:`~xarray.DataArray` with ``.loc``. @@ -1445,7 +1445,7 @@ Bug fixes - ``ValueError`` is raised when coordinates with the wrong size are assigned to a :py:class:`DataArray`. (:issue:`2112`) By `Keisuke Fujii `_. -- Fixed a bug in :py:meth:`~xarary.DatasArray.rolling` with bottleneck. Also, +- Fixed a bug in :py:meth:`~xarray.DataArray.rolling` with bottleneck. Also, fixed a bug in rolling an integer dask array. (:issue:`2113`) By `Keisuke Fujii `_. - Fixed a bug where `keep_attrs=True` flag was neglected if @@ -1482,7 +1482,7 @@ Enhancements supplied list, returning a bool array. See :ref:`selecting values with isin` for full details. Similar to the ``np.isin`` function. By `Maximilian Roos `_. -- Some speed improvement to construct :py:class:`~xarray.DataArrayRolling` +- Some speed improvement to construct :py:class:`~xarray.core.rolling.DataArrayRolling` object (:issue:`1993`) By `Keisuke Fujii `_. 
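A short sketch of the ``loffset`` kwarg mentioned a few entries up, assuming the xarray of
this era (``loffset`` was later deprecated upstream in favour of plain time-offset
arithmetic on the result; the data below are invented)::

    import numpy as np
    import pandas as pd
    import xarray as xr

    times = pd.date_range("2000-01-01", periods=48, freq="H")
    da = xr.DataArray(np.arange(48.0), dims="time", coords={"time": times})

    # Daily means whose labels are shifted forward by 12 hours, as in pandas
    daily = da.resample(time="1D", loffset="12H").mean()
    print(daily["time"].values[0])  # 2000-01-01T12:00:00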
- Handle variables with different values for ``missing_value`` and @@ -1562,8 +1562,8 @@ Enhancements NumPy. By `Stephan Hoyer `_. - Improve :py:func:`~xarray.DataArray.rolling` logic. - :py:func:`~xarray.DataArrayRolling` object now supports - :py:func:`~xarray.DataArrayRolling.construct` method that returns a view + :py:func:`~xarray.core.rolling.DataArrayRolling` object now supports + :py:func:`~xarray.core.rolling.DataArrayRolling.construct` method that returns a view of the DataArray / Dataset object with the rolling-window dimension added to the last axis. This enables more flexible operation, such as strided rolling, windowed rolling, ND-rolling, short-time FFT and convolution. @@ -1634,7 +1634,7 @@ Enhancements 1D coordinate (e.g. time) and a 2D coordinate (e.g. depth as a function of time) (:issue:`1737`). By `Deepak Cherian `_. -- :py:func:`~plot()` rotates x-axis ticks if x-axis is time. +- :py:func:`~plot.plot()` rotates x-axis ticks if x-axis is time. By `Deepak Cherian `_. - :py:func:`~plot.line()` can draw multiple lines if provided with a 2D variable. @@ -1909,7 +1909,7 @@ Enhancements concatenated array/dataset (:issue:`1521`). By `Guido Imperiale `_. -- Speed-up (x 100) of :py:func:`~xarray.conventions.decode_cf_datetime`. +- Speed-up (x 100) of ``xarray.conventions.decode_cf_datetime``. By `Christian Chwala `_. **IO related improvements** @@ -2555,7 +2555,7 @@ Enhancements raising an error (:issue:`1082`). By `Stephan Hoyer `_. - Options for axes sharing between subplots are exposed to - :py:class:`FacetGrid` and :py:func:`~xarray.plot.plot`, so axes + :py:class:`~xarray.plot.FacetGrid` and :py:func:`~xarray.plot.plot`, so axes sharing can be disabled for polar plots. By `Bas Hoonhout `_. - New utility functions :py:func:`~xarray.testing.assert_equal`, @@ -2571,8 +2571,8 @@ Enhancements similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`). By `Joe Hamman `_. - Added the ability write unlimited netCDF dimensions with the ``scipy`` and - ``netcdf4`` backends via the new :py:attr:`~xray.Dataset.encoding` attribute - or via the ``unlimited_dims`` argument to :py:meth:`~xray.Dataset.to_netcdf`. + ``netcdf4`` backends via the new ``xray.Dataset.encoding`` attribute + or via the ``unlimited_dims`` argument to ``xray.Dataset.to_netcdf``. By `Joe Hamman `_. - New :py:meth:`~DataArray.quantile` method to calculate quantiles from DataArray objects (:issue:`1187`). @@ -2651,10 +2651,9 @@ Bug fixes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- :py:meth:`~xarray.Dataset.isel_points` and - :py:meth:`~xarray.Dataset.sel_points` now use vectorised indexing in numpy - and dask (:issue:`1161`), which can result in several orders of magnitude - speedup. +- ``xarray.Dataset.isel_points`` and ``xarray.Dataset.sel_points`` now + use vectorised indexing in numpy and dask (:issue:`1161`), which can + result in several orders of magnitude speedup. By `Jonathan Chambers `_. .. _whats-new.0.8.2: @@ -2763,16 +2762,17 @@ Enhancements any number of ``Dataset`` and/or ``DataArray`` variables. See :ref:`merge` for more details. By `Stephan Hoyer `_. -- DataArray and Dataset method :py:meth:`resample` now supports the +- :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now support the ``keep_attrs=False`` option that determines whether variable and dataset attributes are retained in the resampled object. By `Jeremy McGibbon `_. 
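The rolling ``construct`` method described a few entries above, in brief (a sketch with
invented data; the ``window`` dimension name is the caller's choice)::

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(8.0), dims="time")

    # A strided view with the rolling window materialised as a new dimension
    windowed = da.rolling(time=3).construct("window")
    print(windowed.shape)                # (8, 3)
    print(windowed.isel(time=2).values)  # [0. 1. 2.]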
-- Better multi-index support in DataArray and Dataset :py:meth:`sel` and - :py:meth:`loc` methods, which now behave more closely to pandas and which - also accept dictionaries for indexing based on given level names and labels - (see :ref:`multi-level indexing`). By - `Benoit Bovy `_. +- Better multi-index support in :py:meth:`DataArray.sel`, + :py:meth:`DataArray.loc`, :py:meth:`Dataset.sel` and + :py:meth:`Dataset.loc`, which now behave more closely to pandas and + which also accept dictionaries for indexing based on given level names + and labels (see :ref:`multi-level indexing`). + By `Benoit Bovy `_. - New (experimental) decorators :py:func:`~xarray.register_dataset_accessor` and :py:func:`~xarray.register_dataarray_accessor` for registering custom xarray @@ -2788,7 +2788,7 @@ Enhancements allowing more control on the colorbar (:issue:`872`). By `Fabien Maussion `_. -- New Dataset method :py:meth:`filter_by_attrs`, akin to +- New Dataset method :py:meth:`Dataset.filter_by_attrs`, akin to ``netCDF4.Dataset.get_variables_by_attributes``, to easily filter data variables using its attributes. `Filipe Fernandes `_. @@ -2915,7 +2915,7 @@ Enhancements - Numerical operations now return empty objects on no overlapping labels rather than raising ``ValueError`` (:issue:`739`). -- :py:class:`~pd.Series` is now supported as valid input to the ``Dataset`` +- :py:class:`~pandas.Series` is now supported as valid input to the ``Dataset`` constructor (:issue:`740`). Bug fixes @@ -2934,7 +2934,7 @@ Bug fixes reindexing leads to NaN values (:issue:`738`). - ``Dataset.rename`` and ``DataArray.rename`` support the old and new names being the same (:issue:`724`). -- Fix :py:meth:`~xarray.Dataset.from_dataset` for DataFrames with Categorical +- Fix :py:meth:`~xarray.Dataset.from_dataframe` for DataFrames with Categorical column and a MultiIndex index (:issue:`737`). - Fixes to ensure xarray works properly after the upcoming pandas v0.18 and NumPy v1.11 releases. @@ -2985,7 +2985,7 @@ recommend switching your import statements to ``import xarray as xr``. Breaking changes ~~~~~~~~~~~~~~~~ -- The internal data model used by :py:class:`~xray.DataArray` has been +- The internal data model used by ``xray.DataArray`` has been rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`, `this stackoverflow report`_). Internally, ``DataArray`` is now implemented in terms of ``._variable`` and ``._coords`` attributes instead of holding @@ -3023,7 +3023,7 @@ Breaking changes * x (x) int64 0 1 2 - It is no longer possible to convert a DataArray to a Dataset with - :py:meth:`xray.DataArray.to_dataset` if it is unnamed. This will now + ``xray.DataArray.to_dataset`` if it is unnamed. This will now raise ``ValueError``. If the array is unnamed, you need to supply the ``name`` argument. @@ -3092,7 +3092,7 @@ Enhancements - Plotting: more control on colormap parameters (:issue:`642`). ``vmin`` and ``vmax`` will not be silently ignored anymore. Setting ``center=False`` prevents automatic selection of a divergent colormap. -- New :py:meth:`~xray.Dataset.shift` and :py:meth:`~xray.Dataset.roll` methods +- New ``xray.Dataset.shift`` and ``xray.Dataset.roll`` methods for shifting/rotating datasets or arrays along a dimension: .. ipython:: python @@ -3106,9 +3106,9 @@ Enhancements moves both data and coordinates. - Assigning a ``pandas`` object directly as a ``Dataset`` variable is now permitted. Its index names correspond to the ``dims`` of the ``Dataset``, and its data is aligned. 
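A compact sketch of the ``shift``/``roll`` semantics introduced above, written against the
modern ``xarray`` namespace rather than the historical ``xray`` one (``roll_coords`` is the
later, explicit spelling of the original ``roll`` behaviour)::

    import xarray as xr

    ds = xr.Dataset({"foo": ("x", [1, 2, 3, 4])}, coords={"x": [0, 1, 2, 3]})

    print(ds.shift(x=1)["foo"].values)                   # [nan 1. 2. 3.] -- pads with NaN
    print(ds.roll(x=1, roll_coords=True)["foo"].values)  # [4 1 2 3]      -- wraps around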
-- Passing a :py:class:`pandas.DataFrame` or :py:class:`pandas.Panel` to a Dataset constructor +- Passing a :py:class:`pandas.DataFrame` or ``pandas.Panel`` to a Dataset constructor is now permitted. -- New function :py:func:`~xray.broadcast` for explicitly broadcasting +- New function ``xray.broadcast`` for explicitly broadcasting ``DataArray`` and ``Dataset`` objects against each other. For example: .. ipython:: python @@ -3166,7 +3166,7 @@ API Changes ~~~~~~~~~~~ - The handling of colormaps and discrete color lists for 2D plots in - :py:meth:`~xray.DataArray.plot` was changed to provide more compatibility + ``xray.DataArray.plot`` was changed to provide more compatibility with matplotlib's ``contour`` and ``contourf`` functions (:issue:`538`). Now discrete lists of colors should be specified using ``colors`` keyword, rather than ``cmap``. @@ -3174,10 +3174,10 @@ API Changes Enhancements ~~~~~~~~~~~~ -- Faceted plotting through :py:class:`~xray.plot.FacetGrid` and the - :py:meth:`~xray.plot.plot` method. See :ref:`plotting.faceting` for more details +- Faceted plotting through ``xray.plot.FacetGrid`` and the + ``xray.plot.plot`` method. See :ref:`plotting.faceting` for more details and examples. -- :py:meth:`~xray.Dataset.sel` and :py:meth:`~xray.Dataset.reindex` now support +- ``xray.Dataset.sel`` and ``xray.Dataset.reindex`` now support the ``tolerance`` argument for controlling nearest-neighbor selection (:issue:`629`): @@ -3194,12 +3194,12 @@ Enhancements * x (x) float64 0.9 1.5 This feature requires pandas v0.17 or newer. -- New ``encoding`` argument in :py:meth:`~xray.Dataset.to_netcdf` for writing +- New ``encoding`` argument in ``xray.Dataset.to_netcdf`` for writing netCDF files with compression, as described in the new documentation section on :ref:`io.netcdf.writing_encoded`. -- Add :py:attr:`~xray.Dataset.real` and :py:attr:`~xray.Dataset.imag` +- Add ``xray.Dataset.real`` and ``xray.Dataset.imag`` attributes to Dataset and DataArray (:issue:`553`). -- More informative error message with :py:meth:`~xray.Dataset.from_dataframe` +- More informative error message with ``xray.Dataset.from_dataframe`` if the frame has duplicate columns. - xray now uses deterministic names for dask arrays it creates or opens from disk. This allows xray users to take advantage of dask's nascent support for @@ -3214,9 +3214,9 @@ Bug fixes - Aggregation functions now correctly skip ``NaN`` for data for ``complex128`` dtype (:issue:`554`). - Fixed indexing 0d arrays with unicode dtype (:issue:`568`). -- :py:meth:`~xray.DataArray.name` and Dataset keys must be a string or None to +- ``xray.DataArray.name`` and Dataset keys must be a string or None to be written to netCDF (:issue:`533`). -- :py:meth:`~xray.DataArray.where` now uses dask instead of numpy if either the +- ``xray.DataArray.where`` now uses dask instead of numpy if either the array or ``other`` is a dask array. Previously, if ``other`` was a numpy array the method was evaluated eagerly. - Global attributes are now handled more consistently when loading remote @@ -3243,24 +3243,24 @@ v0.6.0 (21 August 2015) This release includes numerous bug fixes and enhancements. Highlights include the introduction of a plotting module and the new Dataset and DataArray -methods :py:meth:`~xray.Dataset.isel_points`, :py:meth:`~xray.Dataset.sel_points`, -:py:meth:`~xray.Dataset.where` and :py:meth:`~xray.Dataset.diff`. There are no +methods ``xray.Dataset.isel_points``, ``xray.Dataset.sel_points``, +``xray.Dataset.where`` and ``xray.Dataset.diff``. 
There are no breaking changes from v0.5.2. Enhancements ~~~~~~~~~~~~ - Plotting methods have been implemented on DataArray objects - :py:meth:`~xray.DataArray.plot` through integration with matplotlib + ``xray.DataArray.plot`` through integration with matplotlib (:issue:`185`). For an introduction, see :ref:`plotting`. - Variables in netCDF files with multiple missing values are now decoded as NaN after issuing a warning if open_dataset is called with mask_and_scale=True. - We clarified our rules for when the result from an xray operation is a copy - vs. a view (see :ref:`copies vs views` for more details). + vs. a view (see :ref:`copies_vs_views` for more details). - Dataset variables are now written to netCDF files in order of appearance when using the netcdf4 backend (:issue:`479`). -- Added :py:meth:`~xray.Dataset.isel_points` and :py:meth:`~xray.Dataset.sel_points` +- Added ``xray.Dataset.isel_points`` and ``xray.Dataset.sel_points`` to support pointwise indexing of Datasets and DataArrays (:issue:`475`). .. ipython:: @@ -3305,7 +3305,7 @@ Enhancements x (points) |S1 'a' 'b' 'g' * points (points) int64 0 1 2 -- New :py:meth:`~xray.Dataset.where` method for masking xray objects according +- New ``xray.Dataset.where`` method for masking xray objects according to some criteria. This works particularly well with multi-dimensional data: .. ipython:: python @@ -3316,11 +3316,10 @@ Enhancements @savefig where_example.png width=4in height=4in ds.distance.where(ds.distance < 100).plot() -- Added new methods :py:meth:`DataArray.diff ` - and :py:meth:`Dataset.diff ` for finite - difference calculations along a given axis. +- Added new methods ``xray.DataArray.diff`` and ``xray.Dataset.diff`` + for finite difference calculations along a given axis. -- New :py:meth:`~xray.DataArray.to_masked_array` convenience method for +- New ``xray.DataArray.to_masked_array`` convenience method for returning a numpy.ma.MaskedArray. .. ipython:: python @@ -3329,7 +3328,7 @@ Enhancements da.where(da < 0.5) da.where(da < 0.5).to_masked_array(copy=True) -- Added new flag "drop_variables" to :py:meth:`~xray.open_dataset` for +- Added new flag "drop_variables" to ``xray.open_dataset`` for excluding variables from being parsed. This may be useful to drop variables with problems or inconsistent values. @@ -3358,7 +3357,7 @@ options for ``xray.concat``. Backwards incompatible changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- The optional arguments ``concat_over`` and ``mode`` in :py:func:`~xray.concat` have +- The optional arguments ``concat_over`` and ``mode`` in ``xray.concat`` have been removed and replaced by ``data_vars`` and ``coords``. The new arguments are both more easily understood and more robustly implemented, and allowed us to fix a bug where ``concat`` accidentally loaded data into memory. If you set values for @@ -3368,16 +3367,16 @@ Backwards incompatible changes Enhancements ~~~~~~~~~~~~ -- :py:func:`~xray.open_mfdataset` now supports a ``preprocess`` argument for +- ``xray.open_mfdataset`` now supports a ``preprocess`` argument for preprocessing datasets prior to concatenaton. This is useful if datasets cannot be otherwise merged automatically, e.g., if the original datasets have conflicting index coordinates (:issue:`443`). -- :py:func:`~xray.open_dataset` and :py:func:`~xray.open_mfdataset` now use a +- ``xray.open_dataset`` and ``xray.open_mfdataset`` now use a global thread lock by default for reading from netCDF files with dask. 
This avoids possible segmentation faults for reading from netCDF4 files when HDF5 is not configured properly for concurrent access (:issue:`444`). - Added support for serializing arrays of complex numbers with `engine='h5netcdf'`. -- The new :py:func:`~xray.save_mfdataset` function allows for saving multiple +- The new ``xray.save_mfdataset`` function allows for saving multiple datasets to disk simultaneously. This is useful when processing large datasets with dask.array. For example, to save a dataset too big to fit into memory to one file per year, we could write: @@ -3396,7 +3395,7 @@ Bug fixes - Fixed ``min``, ``max``, ``argmin`` and ``argmax`` for arrays with string or unicode types (:issue:`453`). -- :py:func:`~xray.open_dataset` and :py:func:`~xray.open_mfdataset` support +- ``xray.open_dataset`` and ``xray.open_mfdataset`` support supplying chunks as a single integer. - Fixed a bug in serializing scalar datetime variable to netCDF. - Fixed a bug that could occur in serialization of 0-dimensional integer arrays. @@ -3413,9 +3412,9 @@ adds the ``pipe`` method, copied from pandas. Enhancements ~~~~~~~~~~~~ -- Added :py:meth:`~xray.Dataset.pipe`, replicating the `new pandas method`_ in version +- Added ``xray.Dataset.pipe``, replicating the `new pandas method`_ in version 0.16.2. See :ref:`transforming datasets` for more details. -- :py:meth:`~xray.Dataset.assign` and :py:meth:`~xray.Dataset.assign_coords` +- ``xray.Dataset.assign`` and ``xray.Dataset.assign_coords`` now assign new variables in sorted (alphabetical) order, mirroring the behavior in pandas. Previously, the order was arbitrary. @@ -3437,7 +3436,7 @@ Highlights The headline feature in this release is experimental support for out-of-core computing (data that doesn't fit into memory) with dask_. This includes a new -top-level function :py:func:`~xray.open_mfdataset` that makes it easy to open +top-level function ``xray.open_mfdataset`` that makes it easy to open a collection of netCDF (using dask) as a single ``xray.Dataset`` object. For more on dask, read the `blog post introducing xray + dask`_ and the new documentation section :doc:`dask`. @@ -3452,7 +3451,7 @@ Backwards incompatible changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The logic used for choosing which variables are concatenated with - :py:func:`~xray.concat` has changed. Previously, by default any variables + ``xray.concat`` has changed. Previously, by default any variables which were equal across a dimension were not concatenated. This lead to some surprising behavior, where the behavior of groupby and concat operations could depend on runtime values (:issue:`268`). For example: @@ -3487,8 +3486,8 @@ Backwards incompatible changes Enhancements ~~~~~~~~~~~~ -- New :py:meth:`~xray.Dataset.to_array` and enhanced - :py:meth:`~xray.DataArray.to_dataset` methods make it easy to switch back +- New ``xray.Dataset.to_array`` and enhanced + ``xray.DataArray.to_dataset`` methods make it easy to switch back and forth between arrays and datasets: .. ipython:: python @@ -3498,7 +3497,7 @@ Enhancements ds.to_array() ds.to_array().to_dataset(dim='variable') -- New :py:meth:`~xray.Dataset.fillna` method to fill missing values, modeled +- New ``xray.Dataset.fillna`` method to fill missing values, modeled off the pandas method of the same name: .. ipython:: python @@ -3510,7 +3509,7 @@ Enhancements index based alignment and broadcasting like standard binary operations. It also can be applied by group, as illustrated in :ref:`fill with climatology`. 
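The ``fillna`` entry above, condensed into a sketch (values invented; note that filling
with an aligned xarray object broadcasts by dimension name)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": ("x", [np.nan, 1.0, np.nan, 3.0])})

    print(ds.fillna(0.0)["a"].values)        # [0. 1. 0. 3.]
    print(ds.fillna(ds.mean())["a"].values)  # [2. 1. 2. 3.] -- filled with the mean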
-- New :py:meth:`~xray.Dataset.assign` and :py:meth:`~xray.Dataset.assign_coords` +- New ``xray.Dataset.assign`` and ``xray.Dataset.assign_coords`` methods patterned off the new :py:meth:`DataFrame.assign ` method in pandas: @@ -3522,8 +3521,8 @@ Enhancements These methods return a new Dataset (or DataArray) with updated data or coordinate variables. -- :py:meth:`~xray.Dataset.sel` now supports the ``method`` parameter, which works - like the paramter of the same name on :py:meth:`~xray.Dataset.reindex`. It +- ``xray.Dataset.sel`` now supports the ``method`` parameter, which works + like the paramter of the same name on ``xray.Dataset.reindex``. It provides a simple interface for doing nearest-neighbor interpolation: .. use verbatim because I can't seem to install pandas 0.16.1 on RTD :( @@ -3560,7 +3559,7 @@ Enhancements - Accessing data from remote datasets now has retrying logic (with exponential backoff) that should make it robust to occasional bad responses from DAP servers. -- You can control the width of the Dataset repr with :py:class:`xray.set_options`. +- You can control the width of the Dataset repr with ``xray.set_options``. It can be used either as a context manager, in which case the default is restored outside the context: @@ -3586,7 +3585,7 @@ Deprecations ~~~~~~~~~~~~ - The method ``load_data()`` has been renamed to the more succinct - :py:meth:`~xray.Dataset.load`. + ``xray.Dataset.load``. v0.4.1 (18 March 2015) ---------------------- @@ -3599,7 +3598,7 @@ Enhancements - New documentation sections on :ref:`time-series` and :ref:`combining multiple files`. -- :py:meth:`~xray.Dataset.resample` lets you resample a dataset or data array to +- ``xray.Dataset.resample`` lets you resample a dataset or data array to a new temporal resolution. The syntax is the `same as pandas`_, except you need to supply the time dimension explicitly: @@ -3642,7 +3641,7 @@ Enhancements array.resample('1D', dim='time', how='first') -- :py:meth:`~xray.Dataset.swap_dims` allows for easily swapping one dimension +- ``xray.Dataset.swap_dims`` allows for easily swapping one dimension out for another: .. ipython:: python @@ -3652,7 +3651,7 @@ Enhancements ds.swap_dims({'x': 'y'}) This was possible in earlier versions of xray, but required some contortions. -- :py:func:`~xray.open_dataset` and :py:meth:`~xray.Dataset.to_netcdf` now +- ``xray.open_dataset`` and ``xray.Dataset.to_netcdf`` now accept an ``engine`` argument to explicitly select which underlying library (netcdf4 or scipy) is used for reading/writing a netCDF file. @@ -3687,7 +3686,7 @@ Breaking changes - We now automatically align index labels in arithmetic, dataset construction, merging and updating. This means the need for manually invoking methods like - :py:func:`~xray.align` and :py:meth:`~xray.Dataset.reindex_like` should be + ``xray.align`` and ``xray.Dataset.reindex_like`` should be vastly reduced. :ref:`For arithmetic`, we align @@ -3739,7 +3738,7 @@ Breaking changes (a + b).coords This functionality can be controlled through the ``compat`` option, which - has also been added to the :py:class:`~xray.Dataset` constructor. + has also been added to the ``xray.Dataset`` constructor. - Datetime shortcuts such as ``'time.month'`` now return a ``DataArray`` with the name ``'month'``, not ``'time.month'`` (:issue:`345`). This makes it easier to index the resulting arrays when they are used with ``groupby``: @@ -3777,7 +3776,7 @@ Breaking changes Enhancements ~~~~~~~~~~~~ -- Support for :py:meth:`~xray.Dataset.reindex` with a fill method. 
This +- Support for ``xray.Dataset.reindex`` with a fill method. This provides a useful shortcut for upsampling: .. ipython:: python @@ -3791,16 +3790,15 @@ Enhancements - Use functions that return generic ndarrays with DataArray.groupby.apply and Dataset.apply (:issue:`327` and :issue:`329`). Thanks Jeff Gerard! - Consolidated the functionality of ``dumps`` (writing a dataset to a netCDF3 - bytestring) into :py:meth:`~xray.Dataset.to_netcdf` (:issue:`333`). -- :py:meth:`~xray.Dataset.to_netcdf` now supports writing to groups in netCDF4 + bytestring) into ``xray.Dataset.to_netcdf`` (:issue:`333`). +- ``xray.Dataset.to_netcdf`` now supports writing to groups in netCDF4 files (:issue:`333`). It also finally has a full docstring -- you should read it! -- :py:func:`~xray.open_dataset` and :py:meth:`~xray.Dataset.to_netcdf` now +- ``xray.open_dataset`` and ``xray.Dataset.to_netcdf`` now work on netCDF3 files when netcdf4-python is not installed as long as scipy is available (:issue:`333`). -- The new :py:meth:`Dataset.drop ` and - :py:meth:`DataArray.drop ` methods makes it easy to drop - explicitly listed variables or index labels: +- The new ``xray.Dataset.drop`` and ``xray.DataArray.drop`` methods + makes it easy to drop explicitly listed variables or index labels: .. ipython:: python :okwarning: @@ -3813,7 +3811,7 @@ Enhancements arr = xray.DataArray([1, 2, 3], coords=[('x', list('abc'))]) arr.drop(['a', 'c'], dim='x') -- :py:meth:`~xray.Dataset.broadcast_equals` has been added to correspond to +- ``xray.Dataset.broadcast_equals`` has been added to correspond to the new ``compat`` option. - Long attributes are now truncated at 500 characters when printing a dataset (:issue:`338`). This should make things more convenient for working with @@ -3839,8 +3837,8 @@ Deprecations ~~~~~~~~~~~~ - ``dump`` and ``dumps`` have been deprecated in favor of - :py:meth:`~xray.Dataset.to_netcdf`. -- ``drop_vars`` has been deprecated in favor of :py:meth:`~xray.Dataset.drop`. + ``xray.Dataset.to_netcdf``. +- ``drop_vars`` has been deprecated in favor of ``xray.Dataset.drop``. Future plans ~~~~~~~~~~~~ @@ -3970,10 +3968,10 @@ backwards incompatible changes. New features ~~~~~~~~~~~~ -- Added :py:meth:`~xray.Dataset.count` and :py:meth:`~xray.Dataset.dropna` +- Added ``xray.Dataset.count`` and ``xray.Dataset.dropna`` methods, copied from pandas, for working with missing values (:issue:`247`, :issue:`58`). -- Added :py:meth:`DataArray.to_pandas ` for +- Added ``xray.DataArray.to_pandas`` for converting a data array into the pandas object with the same dimensionality (1D to Series, 2D to DataFrame, etc.) (:issue:`255`). - Support for reading gzipped netCDF3 files (:issue:`239`). @@ -4006,7 +4004,7 @@ New features of arrays of metadata that describe the grid on which the points in "variable" arrays lie. They are preserved (when unambiguous) even though mathematical operations. -- **Dataset math** :py:class:`~xray.Dataset` objects now support all arithmetic +- **Dataset math** ``xray.Dataset`` objects now support all arithmetic operations directly. Dataset-array operations map across all dataset variables; dataset-dataset operations act on each pair of variables with the same name. @@ -4022,7 +4020,7 @@ Backwards incompatible changes - ``Dataset.__eq__`` and ``Dataset.__ne__`` are now element-wise operations instead of comparing all values to obtain a single boolean. Use the method - :py:meth:`~xray.Dataset.equals` instead. + ``xray.Dataset.equals`` instead. 
Deprecations ~~~~~~~~~~~~ @@ -4031,7 +4029,7 @@ Deprecations - ``Dataset.select_vars`` deprecated: index a ``Dataset`` with a list of variable names instead. - ``DataArray.select_vars`` and ``DataArray.drop_vars`` deprecated: use - :py:meth:`~xray.DataArray.reset_coords` instead. + ``xray.DataArray.reset_coords`` instead. v0.2 (14 August 2014) --------------------- @@ -4041,16 +4039,16 @@ fixes. Here are the highlights: - There is now a direct constructor for ``DataArray`` objects, which makes it possible to create a DataArray without using a Dataset. This is highlighted - in the refreshed :doc:`tutorial`. + in the refreshed ``tutorial``. - You can perform aggregation operations like ``mean`` directly on - :py:class:`~xray.Dataset` objects, thanks to Joe Hamman. These aggregation + ``xray.Dataset`` objects, thanks to Joe Hamman. These aggregation methods also worked on grouped datasets. - xray now works on Python 2.6, thanks to Anna Kuznetsova. - A number of methods and attributes were given more sensible (usually shorter) names: ``labeled`` -> ``sel``, ``indexed`` -> ``isel``, ``select`` -> ``select_vars``, ``unselect`` -> ``drop_vars``, ``dimensions`` -> ``dims``, ``coordinates`` -> ``coords``, ``attributes`` -> ``attrs``. -- New :py:meth:`~xray.Dataset.load_data` and :py:meth:`~xray.Dataset.close` +- New ``xray.Dataset.load_data`` and ``xray.Dataset.close`` methods for datasets facilitate lower level of control of data loaded from disk. diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 945b3937c43..23d09ba5e33 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -729,13 +729,13 @@ def open_mfdataset( ``combine_by_coords`` and ``combine_nested``. By default the old (now deprecated) ``auto_combine`` will be used, please specify either ``combine='by_coords'`` or ``combine='nested'`` in future. Requires dask to be installed. See documentation for - details on dask [1]. Attributes from the first dataset file are used for the + details on dask [1]_. Attributes from the first dataset file are used for the combined dataset. Parameters ---------- paths : str or sequence - Either a string glob in the form "path/to/my/files/*.nc" or an explicit list of + Either a string glob in the form ``"path/to/my/files/*.nc"`` or an explicit list of files to open. Paths can be given as strings or as pathlib Paths. If concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``manual_combine`` for details). (A string glob will @@ -745,7 +745,7 @@ def open_mfdataset( In general, these should divide the dimensions of each dataset. If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to load entire input files into memory at once. This has a major impact on performance: please - see the full documentation for more details [2]. + see the full documentation for more details [2]_. concat_dim : str, or list of str, DataArray, Index or None, optional Dimensions to concatenate files along. You only need to provide this argument if any of the dimensions along which you want to concatenate is not a dimension @@ -761,6 +761,7 @@ def open_mfdataset( 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: + * 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. * 'equals': all values and dimensions must be the same. @@ -770,6 +771,7 @@ def open_mfdataset( must be equal. 
The returned dataset then contains the combination of all non-null values. * 'override': skip comparing and pick variable from first dataset + preprocess : callable, optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 559c5e16287..4005d4fbf6d 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -506,7 +506,7 @@ def strftime(self, date_format): Returns ------- - Index + pandas.Index Index of formatted strings Examples diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 41ff5a3b32d..b820d215d2f 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -108,7 +108,7 @@ def align( Returns ------- - aligned : same as *objects + aligned : same as `*objects` Tuple of objects with aligned coordinates. Raises diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3308dcef285..b9db30a9f92 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -531,6 +531,7 @@ def combine_by_coords( * 'all': All data variables will be concatenated. * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. + If objects are DataArrays, `data_vars` must be 'all'. coords : {'minimal', 'different', 'all' or list of str}, optional As per the 'data_vars' kwarg, but for coordinate variables. @@ -747,6 +748,7 @@ def auto_combine( 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: + - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. diff --git a/xarray/core/common.py b/xarray/core/common.py index 2afe4b4c3a7..a74318b2f90 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -91,15 +91,23 @@ def wrapped_func(self, dim=None, **kwargs): # type: ignore return wrapped_func - _reduce_extra_args_docstring = """dim : str or sequence of str, optional + _reduce_extra_args_docstring = dedent( + """ + dim : str or sequence of str, optional Dimension(s) over which to apply `{name}`. By default `{name}` is - applied over all dimensions.""" + applied over all dimensions. + """ + ).strip() - _cum_extra_args_docstring = """dim : str or sequence of str, optional + _cum_extra_args_docstring = dedent( + """ + dim : str or sequence of str, optional Dimension over which to apply `{name}`. axis : int or sequence of int, optional Axis over which to apply `{name}`. Only one of the 'dim' - and 'axis' arguments can be supplied.""" + and 'axis' arguments can be supplied. + """ + ).strip() class AbstractArray(ImplementsArrayReduce): @@ -454,7 +462,7 @@ def assign_coords(self, coords=None, **coords_kwargs): def assign_attrs(self, *args, **kwargs): """Assign new attrs to this object. - Returns a new object equivalent to self.attrs.update(*args, **kwargs). + Returns a new object equivalent to ``self.attrs.update(*args, **kwargs)``. Parameters ---------- @@ -481,7 +489,7 @@ def pipe( **kwargs, ) -> T: """ - Apply func(self, *args, **kwargs) + Apply ``func(self, *args, **kwargs)`` This method replicates the pandas method of the same name. 
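The ``open_mfdataset`` parameters documented above combine roughly as in this sketch (the glob and the variable name ``t2m`` are hypothetical; ``chunks`` requires dask to be installed):

```python
import xarray as xr

ds = xr.open_mfdataset(
    "path/to/my/files/*.nc",            # hypothetical glob
    combine="by_coords",                # opt in to the non-deprecated combiner
    compat="no_conflicts",              # one of the options listed above
    preprocess=lambda d: d[["t2m"]],    # applied to each file before combining
    chunks={"time": 100},               # requires dask
)
```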
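Likewise, the ``align`` return value clarified above ("same as ``*objects``") means a tuple you can unpack, one new object per input, as in this small sketch:

```python
import xarray as xr

a = xr.DataArray([1, 2, 3], coords=[("x", [0, 1, 2])])
b = xr.DataArray([10, 20, 30], coords=[("x", [1, 2, 3])])

# A tuple of new objects of the same types as the inputs.
a2, b2 = xr.align(a, b, join="inner")
list(a2.x.values)  # [1, 2] -- only the shared labels remain
```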
@@ -810,6 +818,7 @@ def rolling_exp( ---------- window : A single mapping from a dimension name to window value, optional + dim : str Name of the dimension to create the rolling exponential window along (e.g., `time`). @@ -848,6 +857,7 @@ def coarsen( ---------- dim: dict, optional Mapping from the dimension name to the window size. + dim : str Name of the dimension to create the rolling iterator along (e.g., `time`). @@ -858,7 +868,7 @@ def coarsen( multiple of the window size. If 'trim', the excess entries are dropped. If 'pad', NA will be padded. side : 'left' or 'right' or mapping from dimension to 'left' or 'right' - coord_func: function (name) that is applied to the coordintes, + coord_func : function (name) that is applied to the coordintes, or a mapping from coordinate name to function (name). Returns @@ -921,7 +931,7 @@ def resample( Parameters ---------- indexer : {dim: freq}, optional - Mapping from the dimension name to resample frequency. The + Mapping from the dimension name to resample frequency [1]_. The dimension must be datetime-like. skipna : bool, optional Whether to skip missing values when aggregating in downsampling. diff --git a/xarray/core/computation.py b/xarray/core/computation.py index bb5ab07d8dd..f8e4914e57b 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -947,7 +947,7 @@ def earth_mover_distance(first_samples, appropriately for use in `apply`. You may find helper functions such as numpy.broadcast_arrays helpful in writing your function. `apply_ufunc` also works well with numba's vectorize and guvectorize. Further explanation with - examples are provided in the xarray documentation [3]. + examples are provided in the xarray documentation [3]_. See also -------- diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5b4fc078236..5ccbfa3f2b4 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -45,6 +45,7 @@ def concat( * 'all': All data variables will be concatenated. * list of str: The listed data variables will be concatenated, in addition to the 'minimal' data variables. + If objects are DataArrays, data_vars must be 'all'. coords : {'minimal', 'different', 'all' or list of str}, optional These coordinate variables will be concatenated together: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 23342fc5e0d..1205362ad91 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -239,7 +239,7 @@ class DataArray(AbstractArray, DataWithCoords): ---------- dims : tuple Dimension names associated with this array. - values : np.ndarray + values : numpy.ndarray Access or modify DataArray values as a numpy array. coords : dict-like Dictionary of DataArray objects that label values along each dimension. @@ -1315,7 +1315,7 @@ def interp( values. kwargs: dictionary Additional keyword passed to scipy's interpolator. - **coords_kwarg : {dim: coordinate, ...}, optional + ``**coords_kwarg`` : {dim: coordinate, ...}, optional The keyword arguments form of ``coords``. One of coords or coords_kwargs must be provided. @@ -2044,6 +2044,7 @@ def interpolate_na( provided. - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. 
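A minimal usage sketch of the ``coarsen`` parameters documented above (``boundary`` and ``coord_func``; the data are hypothetical):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(10.0), dims="time", coords={"time": np.arange(10)})

# Window of 3 along "time": boundary="trim" silently drops the one
# leftover entry, and coord_func="min" labels each window with the
# smallest time value in the window instead of the mean.
da.coarsen(time=3, boundary="trim", coord_func="min").mean()
```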
If False, values are treated as if @@ -2063,6 +2064,7 @@ def interpolate_na( - a string that is valid input for pandas.to_timedelta - a :py:class:`numpy.timedelta64` object - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value @@ -2946,7 +2948,7 @@ def quantile( is a scalar. If multiple percentiles are given, first axis of the result corresponds to the quantile and a quantile dimension is added to the return array. The other dimensions are the - dimensions that remain after the reduction of the array. + dimensions that remain after the reduction of the array. See Also -------- @@ -3071,8 +3073,8 @@ def integrate( Coordinate(s) used for the integration. datetime_unit: str, optional Can be used to specify the unit if datetime coordinate is used. - One of {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns', - 'ps', 'fs', 'as'} + One of {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns', 'ps', + 'fs', 'as'} Returns ------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c631a4c11ea..5b9663c2453 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1509,7 +1509,7 @@ def to_netcdf( Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, - 'zlib': True}, ...}`` + 'zlib': True}, ...}`` The `h5netcdf` engine supports both the NetCDF4-style compression encoding parameters ``{'zlib': True, 'complevel': 9}`` and the h5py @@ -2118,7 +2118,7 @@ def thin( indexers: Union[Mapping[Hashable, int], int] = None, **indexers_kwargs: Any, ) -> "Dataset": - """Returns a new dataset with each array indexed along every `n`th + """Returns a new dataset with each array indexed along every `n`-th value for the specified dimension(s) Parameters @@ -2127,7 +2127,7 @@ def thin( A dict with keys matching dimensions and integer values `n` or a single integer `n` applied over all dimensions. One of indexers or indexers_kwargs must be provided. - **indexers_kwargs : {dim: n, ...}, optional + ``**indexers_kwargs`` : {dim: n, ...}, optional The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided. @@ -3476,6 +3476,7 @@ def merge( 'no_conflicts'}, optional String indicating how to compare variables of the same name for potential conflicts: + - 'broadcast_equals': all values must be equal when variables are broadcast against each other to ensure common dimensions. - 'equals': all values and dimensions must be the same. @@ -3484,6 +3485,7 @@ def merge( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + join : {'outer', 'inner', 'left', 'right', 'exact'}, optional Method for joining ``self`` and ``other`` along shared dimensions: @@ -3624,7 +3626,7 @@ def drop_sel(self, labels=None, *, errors="raise", **labels_kwargs): in the dataset. If 'ignore', any given labels that are in the dataset are dropped and no error is raised. **labels_kwargs : {dim: label, ...}, optional - The keyword arguments form of ``dim`` and ``labels` + The keyword arguments form of ``dim`` and ``labels`` Returns ------- @@ -3914,6 +3916,7 @@ def interpolate_na( ---------- dim : str Specifies the dimension along which to interpolate. 
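The nested ``encoding`` mapping shown in the ``to_netcdf`` docstring above looks like this in use (a sketch; assumes a netCDF4-compatible engine is installed and ``example.nc`` is a throwaway path):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({"my_variable": ("x", np.arange(4.0))})

# Pack the floats into int16 on disk; the zlib option assumes the
# netCDF4 or h5netcdf engine is available.
ds.to_netcdf(
    "example.nc",
    encoding={"my_variable": {"dtype": "int16", "scale_factor": 0.1, "zlib": True}},
)
```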
+ method : str, optional String indicating which method to use for interpolation: @@ -3925,6 +3928,7 @@ def interpolate_na( provided. - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if @@ -3944,6 +3948,7 @@ def interpolate_na( - a string that is valid input for pandas.to_timedelta - a :py:class:`numpy.timedelta64` object - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value @@ -5251,7 +5256,7 @@ def integrate(self, coord, datetime_unit=None): datetime_unit Can be specify the unit if datetime coordinate is used. One of {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', - 'as'} + 'as'} Returns ------- diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 38ecc04534a..ec752721781 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -573,6 +573,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): This optional parameter specifies the interpolation method to use when the desired quantile lies between two data points ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. @@ -728,17 +729,19 @@ def map(self, func, shortcut=False, args=(), **kwargs): Callable to apply to each array. shortcut : bool, optional Whether or not to shortcut evaluation under the assumptions that: + (1) The action of `func` does not depend on any of the array metadata (attributes or coordinates) but only on the data and dimensions. (2) The action of `func` creates arrays with homogeneous metadata, that is, with the same dimensions and attributes. + If these conditions are satisfied `shortcut` provides significant speedup. This should be the case for many common groupby operations (e.g., applying numpy ufuncs). - args : tuple, optional + ``*args`` : tuple, optional Positional arguments passed to `func`. - **kwargs + ``**kwargs`` Used to call `func(ar, **kwargs)` for each array `ar`. Returns diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index 5c754c3f49b..16a4943627e 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -269,7 +269,7 @@ def line( if None, use the default for the matplotlib function. add_legend : boolean, optional Add legend with y axis coordinates (2D inputs only). - *args, **kwargs : optional + ``*args``, ``**kwargs`` : optional Additional arguments to matplotlib.pyplot.plot """ # Handle facetgrids first diff --git a/xarray/ufuncs.py b/xarray/ufuncs.py index 0f6fc3b1334..ae2c5c574b6 100644 --- a/xarray/ufuncs.py +++ b/xarray/ufuncs.py @@ -13,6 +13,7 @@ Once NumPy 1.10 comes out with support for overriding ufuncs, this module will hopefully no longer be necessary. 
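A short sketch of the groupby ``quantile`` behaviour documented above (hypothetical data; ``interpolation="linear"`` is the default spelled out in the docstring):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(
    np.arange(6.0),
    dims="x",
    coords={"label": ("x", ["a", "a", "a", "b", "b", "b"])},
)

# One value per group; the scalar q shows up as a "quantile" coordinate.
da.groupby("label").quantile(0.5, interpolation="linear")
```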
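As a usage sketch for the module described above: the wrapped ufuncs behave like their numpy counterparts while preserving xarray metadata (assumes an xarray of this vintage, where ``xarray.ufuncs`` still exists):

```python
import numpy as np
import xarray as xr
import xarray.ufuncs as xu

da = xr.DataArray(np.linspace(0, np.pi, 5), dims="x")

# Dispatches to np.sin but returns a DataArray with metadata intact;
# the wrapper's __doc__ is the cleaned-up numpy docstring produced below.
xu.sin(da)
```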
""" +import textwrap import warnings as _warnings import numpy as _np @@ -78,10 +79,49 @@ def __call__(self, *args, **kwargs): return res +def _skip_signature(doc, name): + if not isinstance(doc, str): + return doc + + if doc.startswith(name): + signature_end = doc.find("\n\n") + doc = doc[signature_end + 2 :] + + return doc + + +def _remove_unused_reference_labels(doc): + if not isinstance(doc, str): + return doc + + max_references = 5 + for num in range(max_references): + label = f".. [{num}]" + reference = f"[{num}]_" + index = f"{num}. " + + if label not in doc or reference in doc: + continue + + doc = doc.replace(label, index) + + return doc + + +def _dedent(doc): + if not isinstance(doc, str): + return doc + + return textwrap.dedent(doc) + + def _create_op(name): func = _UFuncDispatcher(name) func.__name__ = name doc = getattr(_np, name).__doc__ + + doc = _remove_unused_reference_labels(_skip_signature(_dedent(doc), name)) + func.__doc__ = ( "xarray specific variant of numpy.%s. Handles " "xarray.Dataset, xarray.DataArray, xarray.Variable, " From 220adbc65e0b8c46feddaa6984df4a3a1ce0af6b Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 20 Nov 2019 01:23:33 +0900 Subject: [PATCH 20/24] sparse option to reindex and unstack (#3542) * Added fill_value for unstack * remove sparse option and fix unintended changes * a bug fix * Added sparse option to unstack and reindex * black * More tests * black * Remove sparse option from reindex * try __array_function__ where * flake8 --- doc/whats-new.rst | 4 ++++ xarray/core/alignment.py | 5 +++++ xarray/core/dataarray.py | 4 +++- xarray/core/dataset.py | 35 +++++++++++++++++++++++++++++--- xarray/core/variable.py | 38 +++++++++++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 19 ++++++++++++++++++ xarray/tests/test_variable.py | 12 +++++++++++ 7 files changed, 113 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 105d661b5f7..9f5d57d4a72 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -37,6 +37,10 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added the ``sparse`` option to :py:meth:`~xarray.DataArray.unstack`, + :py:meth:`~xarray.Dataset.unstack`, :py:meth:`~xarray.DataArray.reindex`, + :py:meth:`~xarray.Dataset.reindex` (:issue:`3518`). + By `Keisuke Fujii `_. - Added the ``max_gap`` kwarg to :py:meth:`DataArray.interpolate_na` and :py:meth:`Dataset.interpolate_na`. This controls the maximum size of the data diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index b820d215d2f..908119f7995 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -466,6 +466,7 @@ def reindex_variables( tolerance: Any = None, copy: bool = True, fill_value: Optional[Any] = dtypes.NA, + sparse: bool = False, ) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, pd.Index]]: """Conform a dictionary of aligned variables onto a new set of variables, filling in missing values with NaN. @@ -503,6 +504,8 @@ def reindex_variables( the input. In either case, new xarray objects are always returned. 
fill_value : scalar, optional Value to use for newly missing values + sparse: bool, optional + Use a sparse array Returns ------- @@ -571,6 +574,8 @@ def reindex_variables( for name, var in variables.items(): if name not in indexers: + if sparse: + var = var._as_sparse(fill_value=fill_value) key = tuple( slice(None) if d in unchanged_dims else int_indexers.get(d, slice(None)) for d in var.dims diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1205362ad91..c92fcb956b1 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1729,6 +1729,7 @@ def unstack( self, dim: Union[Hashable, Sequence[Hashable], None] = None, fill_value: Any = dtypes.NA, + sparse: bool = False, ) -> "DataArray": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -1742,6 +1743,7 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan + sparse: use a sparse array if True Returns ------- @@ -1773,7 +1775,7 @@ def unstack( -------- DataArray.stack """ - ds = self._to_temp_dataset().unstack(dim, fill_value) + ds = self._to_temp_dataset().unstack(dim, fill_value, sparse) return self._from_temp_dataset(ds) def to_unstacked_dataset(self, dim, level=0): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5b9663c2453..206f2f55b3c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2286,6 +2286,7 @@ def reindex( the input. In either case, a new xarray object is always returned. fill_value : scalar, optional Value to use for newly missing values + sparse: use a sparse array. By default, False **indexers_kwarg : {dim: indexer, ...}, optional Keyword arguments in the same form as ``indexers``. One of indexers or indexers_kwargs must be provided. @@ -2428,6 +2429,29 @@ def reindex( the original and desired indexes. If you do want to fill in the `NaN` values present in the original dataset, use the :py:meth:`~Dataset.fillna()` method.
+ """ + return self._reindex( + indexers, + method, + tolerance, + copy, + fill_value, + sparse=False, + **indexers_kwargs, + ) + + def _reindex( + self, + indexers: Mapping[Hashable, Any] = None, + method: str = None, + tolerance: Number = None, + copy: bool = True, + fill_value: Any = dtypes.NA, + sparse: bool = False, + **indexers_kwargs: Any, + ) -> "Dataset": + """ + Same as reindex, but also supports the sparse option """ indexers = utils.either_dict_or_kwargs(indexers, indexers_kwargs, "reindex") @@ -2444,6 +2468,7 @@ def reindex( tolerance, copy=copy, fill_value=fill_value, + sparse=sparse, ) coord_names = set(self._coord_names) coord_names.update(indexers) @@ -3327,7 +3352,7 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset": index = self.get_index(dim) index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3336,7 +3361,9 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": if index.equals(full_idx): obj = self else: - obj = self.reindex({dim: full_idx}, copy=False, fill_value=fill_value) + obj = self._reindex( + {dim: full_idx}, copy=False, fill_value=fill_value, sparse=sparse + ) new_dim_names = index.names new_dim_sizes = [lev.size for lev in index.levels] @@ -3366,6 +3393,7 @@ def unstack( self, dim: Union[Hashable, Iterable[Hashable]] = None, fill_value: Any = dtypes.NA, + sparse: bool = False, ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -3379,6 +3407,7 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan + sparse: use a sparse array if True Returns ------- @@ -3416,7 +3445,7 @@ def unstack( result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim, fill_value) + result = result._unstack_once(dim, fill_value, sparse) return result def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e630dc4b457..55e8f64d56c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -993,6 +993,36 @@ def chunk(self, chunks=None, name=None, lock=False): return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) + def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): + """ + Use a sparse array as backend. + """ + import sparse + + # TODO what to do if dask-backed? + if fill_value is dtypes.NA: + dtype, fill_value = dtypes.maybe_promote(self.dtype) + else: + dtype = dtypes.result_type(self.dtype, fill_value) + + if sparse_format is _default: + sparse_format = "coo" + try: + as_sparse = getattr(sparse, "as_{}".format(sparse_format.lower())) + except AttributeError: + raise ValueError("{} is not a valid sparse format".format(sparse_format)) + + data = as_sparse(self.data.astype(dtype), fill_value=fill_value) + return self._replace(data=data) + + def _to_dense(self): + """ + Change backend from sparse to np.ndarray + """ + if hasattr(self._data, "todense"): + return self._replace(data=self._data.todense()) + return self.copy(deep=False) + def isel( self: VariableType, indexers: Mapping[Hashable, Any] = None, @@ -2021,6 +2051,14 @@ def chunk(self, chunks=None, name=None, lock=False): # Dummy - do not chunk. This method is invoked e.g.
by Dataset.chunk() return self.copy(deep=False) + def _as_sparse(self, sparse_format=_default, fill_value=_default): + # Dummy + return self.copy(deep=False) + + def _to_dense(self): + # Dummy + return self.copy(deep=False) + def _finalize_indexing_result(self, dims, data): if getattr(data, "ndim", 0) != 1: # returns Variable rather than IndexVariable if multi-dimensional diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index de074da541f..e8fe768b783 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2811,6 +2811,25 @@ def test_unstack_fill_value(self): expected = ds["var"].unstack("index").fillna(-1).astype(np.int) assert actual.equals(expected) + @requires_sparse + def test_unstack_sparse(self): + ds = xr.Dataset( + {"var": (("x",), np.arange(6))}, + coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, + ) + # make ds incomplete + ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) + # test fill_value + actual = ds.unstack("index", sparse=True) + expected = ds.unstack("index") + assert actual["var"].variable._to_dense().equals(expected["var"].variable) + assert actual["var"].data.density < 1.0 + + actual = ds["var"].unstack("index", sparse=True) + expected = ds["var"].unstack("index") + assert actual.variable._to_dense().equals(expected.variable) + assert actual.data.density < 1.0 + def test_stack_unstack_fast(self): ds = Dataset( { diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d92a68729b5..ee8d54e567e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -33,6 +33,7 @@ assert_identical, raises_regex, requires_dask, + requires_sparse, source_ndarray, ) @@ -1862,6 +1863,17 @@ def test_getitem_with_mask_nd_indexer(self): ) +@requires_sparse +class TestVariableWithSparse: + # TODO inherit VariableSubclassobjects to cover more tests + + def test_as_sparse(self): + data = np.arange(12).reshape(3, 4) + var = Variable(("x", "y"), data)._as_sparse(fill_value=-1) + actual = var._to_dense() + assert_identical(var, actual) + + class TestIndexVariable(VariableSubclassobjects): cls = staticmethod(IndexVariable) From 0ef9aa3abae55833e4431d690bc55c5b5a44911b Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Tue, 19 Nov 2019 17:21:48 -0500 Subject: [PATCH 21/24] 0.14.1 whatsnew (#3547) --- doc/whats-new.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9f5d57d4a72..f47aad9b5a8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,7 +15,7 @@ What's New .. _whats-new.0.14.1: -v0.14.1 (unreleased) +v0.14.1 (19 Nov 2019) -------------------- Breaking changes @@ -41,23 +41,20 @@ New Features :py:meth:`~xarray.Dataset.unstack`, :py:meth:`~xarray.DataArray.reindex`, :py:meth:`~xarray.Dataset.reindex` (:issue:`3518`). By `Keisuke Fujii `_. - -- Added the ``max_gap`` kwarg to :py:meth:`DataArray.interpolate_na` and - :py:meth:`Dataset.interpolate_na`. This controls the maximum size of the data - Added the ``fill_value`` option to :py:meth:`DataArray.unstack` and :py:meth:`Dataset.unstack` (:issue:`3518`, :pull:`3541`). By `Keisuke Fujii `_. - Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data gap that will be filled by interpolation. By `Deepak Cherian `_. 
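Outside the test suite, the sparse option from this patch would be used roughly like this (a sketch; requires the ``sparse`` package, and ``_to_dense``/``density`` appear in the tests above):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"var": ("z", np.arange(4.0))},
    coords={"x": ("z", [0, 0, 1, 2]), "y": ("z", ["a", "b", "a", "b"])},
).set_index(z=["x", "y"])

# Only 4 of the 3 x 2 possible (x, y) pairs exist, so the sparse result
# keeps the missing cells "virtual" instead of materialising NaNs.
sparse_ds = ds.unstack("z", sparse=True)
sparse_ds["var"].data.density                      # < 1.0
dense_var = sparse_ds["var"].variable._to_dense()  # round-trip, as in the test
```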
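The ``max_gap`` entry above can likewise be illustrated with a small sketch (hypothetical data; gap length is measured in coordinate units, as the docstrings earlier in this series state):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(
    [np.nan, 1.0, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0],
    dims="x",
    coords={"x": np.arange(8)},
)

# The gap at x=2 has length 3 - 1 = 2, so it is filled (to 2.0); the gap
# at x=4..6 has length 7 - 3 = 4 > max_gap and is left as NaN, as is the
# leading NaN, which has no left anchor for linear interpolation.
da.interpolate_na(dim="x", method="linear", max_gap=2)
```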
-- :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. +- Added :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` for dropping labels. :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for dropping variables (including coordinates). The existing :py:meth:`Dataset.drop` & :py:meth:`DataArray.drop` methods remain as a backward compatible option for dropping either labels or variables, but using the more specific methods is encouraged. (:pull:`3475`) By `Maximilian Roos `_ -- :py:meth:`Dataset.map` & :py:meth:`GroupBy.map` & :py:meth:`Resample.map` have been added for +- Added :py:meth:`Dataset.map` & :py:meth:`GroupBy.map` & :py:meth:`Resample.map` for mapping / applying a function over each item in the collection, reflecting the widely used and least surprising name for this operation. The existing ``apply`` methods remain for backward compatibility, though using the ``map`` @@ -131,7 +128,7 @@ Documentation - Fix leap year condition in `monthly means example `_. By `Mickaël Lalande `_. - Fix the documentation of :py:meth:`DataArray.resample` and - :py:meth:`Dataset.resample` — explicitly state that a + :py:meth:`Dataset.resample`, explicitly stating that a datetime-like dimension is required. (:pull:`3400`) By `Justus Magin `_. - Update the :ref:`terminology` page to address multidimensional coordinates. (:pull:`3410`) From 7466be623fbb4bbb5efc389d31436bd38e53d198 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Tue, 19 Nov 2019 17:49:32 -0500 Subject: [PATCH 22/24] Revert to dev version --- doc/whats-new.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f47aad9b5a8..de834512e36 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,6 +15,31 @@ What's New .. _whats-new.0.14.1: + +v0.15.0 (unreleased) +-------------------- + +Breaking changes +~~~~~~~~~~~~~~~~ + + +New Features +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + + v0.14.1 (19 Nov 2019) -------------------- From 6b70107ab3063187b663290538c0d5a4107dab6e Mon Sep 17 00:00:00 2001 From: crusaderky Date: Wed, 20 Nov 2019 09:47:56 +0000 Subject: [PATCH 23/24] Clarify conda environments for new contributors (#3551) --- doc/contributing.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/contributing.rst b/doc/contributing.rst index 028ec47e014..3cd0b3e8868 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -151,7 +151,9 @@ We'll now kick off a two-step process: .. code-block:: none # Create and activate the build environment - conda env create -f ci/requirements/py36.yml + # This is for Linux and MacOS. On Windows, use py37-windows.yml instead. 
+ conda env create -f ci/requirements/py37.yml + conda activate xarray-tests # or with older versions of Anaconda: From 8d09879748d2e201ac6de7345e71fa7320801131 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Thu, 21 Nov 2019 09:45:20 -0500 Subject: [PATCH 24/24] Tweaks to release instructions (#3555) * tweaks to release instructions * Update HOW_TO_RELEASE.md Co-Authored-By: keewis * no need for --reverse either * add cool script as option from @keewis * whatsnew reference * tweak --- HOW_TO_RELEASE => HOW_TO_RELEASE.md | 51 ++++++++++++++++++++++------- doc/whats-new.rst | 2 +- 2 files changed, 41 insertions(+), 12 deletions(-) rename HOW_TO_RELEASE => HOW_TO_RELEASE.md (74%) diff --git a/HOW_TO_RELEASE b/HOW_TO_RELEASE.md similarity index 74% rename from HOW_TO_RELEASE rename to HOW_TO_RELEASE.md index 5bf9bf38ded..cdeb0e19a3e 100644 --- a/HOW_TO_RELEASE +++ b/HOW_TO_RELEASE.md @@ -1,9 +1,11 @@ -How to issue an xarray release in 15 easy steps +How to issue an xarray release in 14 easy steps Time required: about an hour. 1. Ensure your master branch is synced to upstream: - git pull upstream master + ``` + git pull upstream master + ``` 2. Look over whats-new.rst and the docs. Make sure "What's New" is complete (check the date!) and consider adding a brief summary note describing the release at the top. @@ -12,37 +14,53 @@ Time required: about an hour. - Function/method references should include links to the API docs. - Sometimes notes get added in the wrong section of whats-new, typically due to a bad merge. Check for these before a release by using git diff, - e.g., ``git diff v0.X.Y whats-new.rst`` where 0.X.Y is the previous + e.g., `git diff v0.X.Y whats-new.rst` where 0.X.Y is the previous release. 3. If you have any doubts, run the full test suite one final time! - py.test + ``` + pytest + ``` 4. On the master branch, commit the release in git: + ``` git commit -a -m 'Release v0.X.Y' + ``` 5. Tag the release: + ``` git tag -a v0.X.Y -m 'v0.X.Y' + ``` 6. Build source and binary wheels for pypi: + ``` git clean -xdf # this deletes all uncommited changes! python setup.py bdist_wheel sdist + ``` 7. Use twine to register and upload the release on pypi. Be careful, you can't take this back! + ``` twine upload dist/xarray-0.X.Y* + ``` You will need to be listed as a package owner at https://pypi.python.org/pypi/xarray for this to work. 8. Push your changes to master: + ``` git push upstream master git push upstream --tags + ``` 9. Update the stable branch (used by ReadTheDocs) and switch back to master: + ``` git checkout stable git rebase master git push upstream stable git checkout master - It's OK to force push to 'stable' if necessary. - We also update the stable branch with `git cherrypick` for documentation - only fixes that apply the current released version. + ``` + It's OK to force push to 'stable' if necessary. (We also update the stable + branch with `git cherrypick` for documentation only fixes that apply the + current released version.) 10. Add a section for the next release (v.X.(Y+1)) to doc/whats-new.rst. 11. Commit your changes and push to master again: - git commit -a -m 'Revert to dev version' + ``` + git commit -a -m 'New whatsnew section' git push upstream master + ``` You're done pushing to master! 12. Issue the release on GitHub. Click on "Draft a new release" at https://github.com/pydata/xarray/releases. Type in the version number, but @@ -53,11 +71,22 @@ Time required: about an hour. 14. 
Issue the release announcement! For bug fix releases, I usually only email xarray@googlegroups.com. For major/feature releases, I will email a broader list (no more than once every 3-6 months): - pydata@googlegroups.com, xarray@googlegroups.com, - numpy-discussion@scipy.org, scipy-user@scipy.org, - pyaos@lists.johnny-lin.com + - pydata@googlegroups.com + - xarray@googlegroups.com + - numpy-discussion@scipy.org + - scipy-user@scipy.org + - pyaos@lists.johnny-lin.com + Google search will turn up examples of prior release announcements (look for "ANN xarray"). + You can get a list of contributors with: + ``` + git log "$(git tag --sort="v:refname" | sed -n 'x;$p').." --format="%aN" | sort -u + ``` + or by replacing `v0.X.Y` with the _previous_ release in: + ``` + git log v0.X.Y.. --format="%aN" | sort -u + ``` Note on version numbering: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index de834512e36..91eed098522 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,7 +13,7 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.14.1: +.. _whats-new.0.15.0: v0.15.0 (unreleased)