Merge remote-tracking branch 'upstream/master' into groupby-plot

* upstream/master: Add nanmedian for dask arrays (pydata#3604) added pyinterp to related projects (pydata#3655) Allow incomplete hypercubes in combine_by_coords (pydata#3649) concat keeps attrs from first variable. (pydata#3637)
dcherian · Dec 30, 2019 · e018ea8 · e018ea8
2 parents 97f3323 + b3d3b44
commit e018ea8
Show file tree

Hide file tree

Showing 12 changed files with 176 additions and 25 deletions.
diff --git a/doc/related-projects.rst b/doc/related-projects.rst
@@ -25,6 +25,7 @@ Geosciences
 - `PyGDX <https://pygdx.readthedocs.io/en/latest/>`_: Python 3 package for
   accessing data stored in GAMS Data eXchange (GDX) files. Also uses a custom
   subclass.
+- `pyinterp <https://pangeo-pyinterp.readthedocs.io/en/latest/>`_: Python 3 package for interpolating geo-referenced data used in the field of geosciences.
 - `pyXpcm <https://pyxpcm.readthedocs.io>`_: xarray-based Profile Classification Modelling (PCM), mostly for ocean data.
 - `Regionmask <https://regionmask.readthedocs.io/>`_: plotting and creation of masks of spatial regions
 - `rioxarray <https://corteva.github.io/rioxarray>`_: geospatial xarray extension powered by rasterio

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -25,6 +25,12 @@ Breaking changes
 
 New Features
 ~~~~~~~~~~~~
+- Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking
+  to a single chunk along all reduction axes. (:issue:`2999`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+- :py:func:`xarray.concat` now preserves attributes from the first Variable.
+  (:issue:`2575`, :issue:`2060`, :issue:`1614`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` and ``GroupBy.quantile``
   now work with dask Variables.
   By `Deepak Cherian <https://github.com/dcherian>`_.
@@ -38,6 +44,9 @@ New Features
 
 Bug fixes
 ~~~~~~~~~
+- Fix :py:meth:`xarray.combine_by_coords` to allow for combining incomplete
+  hypercubes of Datasets (:issue:`3648`).  By `Ian Bolliger
+  <https://github.com/bolliger32>`_.
 - Fix :py:meth:`xarray.combine_by_coords` when combining cftime coordinates
   which span long time intervals (:issue:`3535`).  By `Spencer Clark
   <https://github.com/spencerkclark>`_.

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
@@ -115,11 +115,12 @@ def _infer_concat_order_from_coords(datasets):
     return combined_ids, concat_dims
 
 
-def _check_shape_tile_ids(combined_tile_ids):
+def _check_dimension_depth_tile_ids(combined_tile_ids):
+    """
+    Check all tuples are the same length, i.e. check that all lists are
+    nested to the same depth.
+    """
     tile_ids = combined_tile_ids.keys()
-
-    # Check all tuples are the same length
-    # i.e. check that all lists are nested to the same depth
     nesting_depths = [len(tile_id) for tile_id in tile_ids]
     if not nesting_depths:
         nesting_depths = [0]
@@ -128,8 +129,13 @@ def _check_shape_tile_ids(combined_tile_ids):
             "The supplied objects do not form a hypercube because"
             " sub-lists do not have consistent depths"
         )
+    # return these just to be reused in _check_shape_tile_ids
+    return tile_ids, nesting_depths
 
-    # Check all lists along one dimension are same length
+
+def _check_shape_tile_ids(combined_tile_ids):
+    """Check all lists along one dimension are same length."""
+    tile_ids, nesting_depths = _check_dimension_depth_tile_ids(combined_tile_ids)
     for dim in range(nesting_depths[0]):
         indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
         occurrences = Counter(indices_along_dim)
@@ -536,7 +542,8 @@ def combine_by_coords(
     coords : {'minimal', 'different', 'all' or list of str}, optional
         As per the 'data_vars' kwarg, but for coordinate variables.
     fill_value : scalar, optional
-        Value to use for newly missing values
+        Value to use for newly missing values. If None, raises a ValueError if
+        the passed Datasets do not create a complete hypercube.
     join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
         String indicating how to combine differing indexes
         (excluding concat_dim) in objects
@@ -653,6 +660,15 @@ def combine_by_coords(
     temperature    (y, x) float64 1.654 10.63 7.015 2.543 ... 12.46 2.22 15.96
     precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
 
+    >>> xr.combine_by_coords([x1, x2, x3])
+    <xarray.Dataset>
+    Dimensions:        (x: 6, y: 4)
+    Coordinates:
+    * x              (x) int64 10 20 30 40 50 60
+    * y              (y) int64 0 1 2 3
+    Data variables:
+    temperature    (y, x) float64 1.654 10.63 7.015 nan ... 12.46 2.22 15.96
+    precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
     """
 
     # Group by data vars
@@ -667,7 +683,13 @@ def combine_by_coords(
             list(datasets_with_same_vars)
         )
 
-        _check_shape_tile_ids(combined_ids)
+        if fill_value is None:
+            # check that datasets form complete hypercube
+            _check_shape_tile_ids(combined_ids)
+        else:
+            # check only that all datasets have same dimension depth for these
+            # vars
+            _check_dimension_depth_tile_ids(combined_ids)
 
         # Concatenate along all of concat_dims one by one to create single ds
         concatenated = _combine_nd(

diff --git a/xarray/core/concat.py b/xarray/core/concat.py
@@ -93,12 +93,14 @@ def concat(
           those of the first object with that dimension. Indexes for the same
           dimension must have the same size in all objects.
 
-    indexers, mode, concat_over : deprecated
-
     Returns
     -------
     concatenated : type of objs
 
+    Notes
+    -----
+    Each concatenated Variable preserves corresponding ``attrs`` from the first element of ``objs``.
+
     See also
     --------
     merge

diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py
@@ -1,8 +1,14 @@
 from distutils.version import LooseVersion
+from typing import Iterable
 
-import dask.array as da
 import numpy as np
-from dask import __version__ as dask_version
+
+try:
+    import dask.array as da
+    from dask import __version__ as dask_version
+except ImportError:
+    dask_version = "0.0.0"
+    da = None
 
 if LooseVersion(dask_version) >= LooseVersion("2.0.0"):
     meta_from_array = da.utils.meta_from_array
@@ -89,3 +95,76 @@ def meta_from_array(x, ndim=None, dtype=None):
             meta = meta.astype(dtype)
 
         return meta
+
+
+if LooseVersion(dask_version) >= LooseVersion("2.8.1"):
+    median = da.median
+else:
+    # Copied from dask v2.8.1
+    # Used under the terms of Dask's license, see licenses/DASK_LICENSE.
+    def median(a, axis=None, keepdims=False):
+        """
+        This works by automatically chunking the reduced axes to a single chunk
+        and then calling ``numpy.median`` function across the remaining dimensions
+        """
+
+        if axis is None:
+            raise NotImplementedError(
+                "The da.median function only works along an axis.  "
+                "The full algorithm is difficult to do in parallel"
+            )
+
+        if not isinstance(axis, Iterable):
+            axis = (axis,)
+
+        axis = [ax + a.ndim if ax < 0 else ax for ax in axis]
+
+        a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})
+
+        result = a.map_blocks(
+            np.median,
+            axis=axis,
+            keepdims=keepdims,
+            drop_axis=axis if not keepdims else None,
+            chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
+            if keepdims
+            else None,
+        )
+
+        return result
+
+
+if LooseVersion(dask_version) > LooseVersion("2.9.0"):
+    nanmedian = da.nanmedian
+else:
+
+    def nanmedian(a, axis=None, keepdims=False):
+        """
+        This works by automatically chunking the reduced axes to a single chunk
+        and then calling ``numpy.nanmedian`` function across the remaining dimensions
+        """
+
+        if axis is None:
+            raise NotImplementedError(
+                "The da.nanmedian function only works along an axis.  "
+                "The full algorithm is difficult to do in parallel"
+            )
+
+        if not isinstance(axis, Iterable):
+            axis = (axis,)
+
+        axis = [ax + a.ndim if ax < 0 else ax for ax in axis]
+
+        a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})
+
+        result = a.map_blocks(
+            np.nanmedian,
+            axis=axis,
+            keepdims=keepdims,
+            drop_axis=axis if not keepdims else None,
+            chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
+            if keepdims
+            else None,
+        )
+
+        return result
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from . import dask_array_ops, dtypes, npcompat, nputils
+from . import dask_array_ops, dask_array_compat, dtypes, npcompat, nputils
 from .nputils import nanfirst, nanlast
 from .pycompat import dask_array_type
 
@@ -284,7 +284,7 @@ def _ignore_warnings_if(condition):
         yield
 
 
-def _create_nan_agg_method(name, coerce_strings=False):
+def _create_nan_agg_method(name, dask_module=dask_array, coerce_strings=False):
     from . import nanops
 
     def f(values, axis=None, skipna=None, **kwargs):
@@ -301,7 +301,7 @@ def f(values, axis=None, skipna=None, **kwargs):
             nanname = "nan" + name
             func = getattr(nanops, nanname)
         else:
-            func = _dask_or_eager_func(name)
+            func = _dask_or_eager_func(name, dask_module=dask_module)
 
         try:
             return func(values, axis=axis, **kwargs)
@@ -337,7 +337,7 @@ def f(values, axis=None, skipna=None, **kwargs):
 std.numeric_only = True
 var = _create_nan_agg_method("var")
 var.numeric_only = True
-median = _create_nan_agg_method("median")
+median = _create_nan_agg_method("median", dask_module=dask_array_compat)
 median.numeric_only = True
 prod = _create_nan_agg_method("prod")
 prod.numeric_only = True

diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py
@@ -6,8 +6,10 @@
 
 try:
     import dask.array as dask_array
+    from . import dask_array_compat
 except ImportError:
     dask_array = None
+    dask_array_compat = None  # type: ignore
 
 
 def _replace_nan(a, val):
@@ -141,7 +143,15 @@ def nanmean(a, axis=None, dtype=None, out=None):
 
 
 def nanmedian(a, axis=None, out=None):
-    return _dask_or_eager_func("nanmedian", eager_module=nputils)(a, axis=axis)
+    # The dask algorithm works by rechunking to one chunk along axis
+    # Make sure we trigger the dask error when passing all dimensions
+    # so that we don't rechunk the entire array to one chunk and
+    # possibly blow memory
+    if axis is not None and len(np.atleast_1d(axis)) == a.ndim:
+        axis = None
+    return _dask_or_eager_func(
+        "nanmedian", dask_module=dask_array_compat, eager_module=nputils
+    )(a, axis=axis)
 
 
 def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs):

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -1625,8 +1625,9 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
         if not shortcut:
             for var in variables:
                 if var.dims != first_var.dims:
-                    raise ValueError("inconsistent dimensions")
-                utils.remove_incompatible_items(attrs, var.attrs)
+                    raise ValueError(
+                        f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var.dims)}"
+                    )
 
         return cls(dims, data, attrs, encoding)
 

diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
@@ -711,6 +711,22 @@ def test_check_for_impossible_ordering(self):
         ):
             combine_by_coords([ds1, ds0])
 
+    def test_combine_by_coords_incomplete_hypercube(self):
+        # test that this succeeds with default fill_value
+        x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
+        x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
+        x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
+        actual = combine_by_coords([x1, x2, x3])
+        expected = Dataset(
+            {"a": (("y", "x"), [[1, 1], [1, np.nan]])},
+            coords={"y": [0, 1], "x": [0, 1]},
+        )
+        assert_identical(expected, actual)
+
+        # test that this fails if fill_value is None
+        with pytest.raises(ValueError):
+            combine_by_coords([x1, x2, x3], fill_value=None)
+
 
 @pytest.mark.filterwarnings(
     "ignore:In xarray version 0.15 `auto_combine` " "will be deprecated"

diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py
@@ -462,3 +462,16 @@ def test_concat_join_kwarg(self):
         for join in expected:
             actual = concat([ds1, ds2], join=join, dim="x")
             assert_equal(actual, expected[join].to_array())
+
+
+@pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {}))
+@pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {}))
+def test_concat_attrs_first_variable(attr1, attr2):
+
+    arrs = [
+        DataArray([[1], [2]], dims=["x", "y"], attrs=attr1),
+        DataArray([[3], [4]], dims=["x", "y"], attrs=attr2),
+    ]
+
+    concat_attrs = concat(arrs, "y").attrs
+    assert concat_attrs == attr1
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
@@ -216,8 +216,10 @@ def test_reduce(self):
         self.assertLazyAndAllClose(u.argmin(dim="x"), actual)
         self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
         self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x"))
-        with raises_regex(NotImplementedError, "dask"):
+        with raises_regex(NotImplementedError, "only works along an axis"):
             v.median()
+        with raises_regex(NotImplementedError, "only works along an axis"):
+            v.median(v.dims)
         with raise_if_dask_computes():
             v.reduce(duck_array_ops.mean)
 

diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
@@ -432,7 +432,7 @@ def test_concat(self):
         assert_identical(
             Variable(["b", "a"], np.array([x, y])), Variable.concat((v, w), "b")
         )
-        with raises_regex(ValueError, "inconsistent dimensions"):
+        with raises_regex(ValueError, "Variable has dimensions"):
             Variable.concat([v, Variable(["c"], y)], "b")
         # test indexers
         actual = Variable.concat(
@@ -451,16 +451,12 @@ def test_concat(self):
             Variable.concat([v[:, 0], v[:, 1:]], "x")
 
     def test_concat_attrs(self):
-        # different or conflicting attributes should be removed
+        # always keep attrs from first variable
         v = self.cls("a", np.arange(5), {"foo": "bar"})
         w = self.cls("a", np.ones(5))
         expected = self.cls(
             "a", np.concatenate([np.arange(5), np.ones(5)])
         ).to_base_variable()
-        assert_identical(expected, Variable.concat([v, w], "a"))
-        w.attrs["foo"] = 2
-        assert_identical(expected, Variable.concat([v, w], "a"))
-        w.attrs["foo"] = "bar"
         expected.attrs["foo"] = "bar"
         assert_identical(expected, Variable.concat([v, w], "a"))