Merge branch 'master' into flake

pydata · Dec 24, 2018 · e2d4ad0 · e2d4ad0
2 parents 1cb412c + d8d87d2
commit e2d4ad0
Show file tree

Hide file tree

Showing 11 changed files with 236 additions and 15 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -66,6 +66,10 @@ Enhancements
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now support the
   ``loffset`` kwarg just like Pandas.
   By `Deepak Cherian <https://github.com/dcherian>`_
+- The `apply` methods for `DatasetGroupBy`, `DataArrayGroupBy`,
+  `DatasetResample` and `DataArrayResample` can now pass positional arguments to
+  the applied function.
+  By `Matti Eskelinen <https://github.com/maaleske>`_.
 - 0d slices of ndarrays are now obtained directly through indexing, rather than
   extracting and wrapping a scalar, avoiding unnecessary copying. By `Daniel
   Wennberg <https://github.com/danielwe>`_.
@@ -83,7 +87,11 @@ Bug fixes
   By `Martin Raspaud <https://github.com/mraspaud>`_.
 - Fix parsing of ``_Unsigned`` attribute set by OPENDAP servers. (:issue:`2583`).
   By `Deepak Cherian <https://github.com/dcherian>`_
-
+- Fix failure in time encoding when exporting to netCDF with versions of pandas
+  less than 0.21.1 (:issue:`2623`).  By `Spencer Clark
+  <https://github.com/spencerkclark>`_.
+- Fix MultiIndex selection to update label and level (:issue:`2619`).
+  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
 
 .. _whats-new.0.11.0:
 
@@ -256,13 +264,17 @@ Announcements of note:
   for more details.
 - We have a new :doc:`roadmap` that outlines our future development plans.
 
+- `Dataset.apply` now properly documents the way `func` is called.
+  By `Matti Eskelinen <https://github.com/maaleske>`_.
+
 Enhancements
 ~~~~~~~~~~~~
 
 - :py:meth:`~xarray.DataArray.differentiate` and
   :py:meth:`~xarray.Dataset.differentiate` are newly added.
   (:issue:`1332`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
+
 - Default colormap for sequential and divergent data can now be set via
   :py:func:`~xarray.set_options()`
   (:issue:`2394`)

diff --git a/xarray/coding/times.py b/xarray/coding/times.py
@@ -357,7 +357,7 @@ def encode_cf_datetime(dates, units=None, calendar=None):
 
         delta_units = _netcdf_to_numpy_timeunit(delta)
         time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]')
-        ref_date = np.datetime64(pd.Timestamp(ref_date))
+        ref_date = pd.Timestamp(ref_date)
 
         # Wrap the dates in a DatetimeIndex to do the subtraction to ensure
         # an OverflowError is raised if the ref_date is too far away from

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -13,8 +13,8 @@
 import xarray as xr
 
 from . import (
-    alignment, duck_array_ops, formatting, groupby, indexing, ops, resample,
-    rolling, utils)
+    alignment, duck_array_ops, formatting, groupby, indexing, ops, pdcompat,
+    resample, rolling, utils)
 from ..coding.cftimeindex import _parse_array_of_cftime_strings
 from .alignment import align
 from .common import (
@@ -2426,6 +2426,12 @@ def stack(self, dimensions=None, **dimensions_kwargs):
 
     def _unstack_once(self, dim):
         index = self.get_index(dim)
+        # GH2619. For MultiIndex, we need to call remove_unused_levels.
+        if LooseVersion(pd.__version__) >= "0.20":
+            index = index.remove_unused_levels()
+        else:  # for pandas 0.19
+            index = pdcompat.remove_unused_levels(index)
+
         full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
 
         # take a shortcut in case the MultiIndex was not modified.
@@ -2948,8 +2954,8 @@ def apply(self, func, keep_attrs=None, args=(), **kwargs):
         Parameters
         ----------
         func : function
-            Function which can be called in the form `f(x, **kwargs)` to
-            transform each DataArray `x` in this dataset into another
+            Function which can be called in the form `func(x, *args, **kwargs)`
+            to transform each DataArray `x` in this dataset into another
             DataArray.
         keep_attrs : bool, optional
             If True, the dataset's attributes (`attrs`) will be copied from

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -503,7 +503,7 @@ def lookup_order(dimension):
         new_order = sorted(stacked.dims, key=lookup_order)
         return stacked.transpose(*new_order)
 
-    def apply(self, func, shortcut=False, **kwargs):
+    def apply(self, func, shortcut=False, args=(), **kwargs):
         """Apply a function over each array in the group and concatenate them
         together into a new array.
 
@@ -532,6 +532,8 @@ def apply(self, func, shortcut=False, **kwargs):
             If these conditions are satisfied `shortcut` provides significant
             speedup. This should be the case for many common groupby operations
             (e.g., applying numpy ufuncs).
+        args : tuple, optional
+            Positional arguments passed to `func`.
         **kwargs
             Used to call `func(ar, *args, **kwargs)` for each array `ar`.
 
@@ -544,7 +546,7 @@ def apply(self, func, shortcut=False, **kwargs):
             grouped = self._iter_grouped_shortcut()
         else:
             grouped = self._iter_grouped()
-        applied = (maybe_wrap_array(arr, func(arr, **kwargs))
+        applied = (maybe_wrap_array(arr, func(arr, *args, **kwargs))
                    for arr in grouped)
         return self._combine(applied, shortcut=shortcut)
 
@@ -642,7 +644,7 @@ def wrapped_func(self, dim=DEFAULT_DIMS, axis=None,
 
 
 class DatasetGroupBy(GroupBy, ImplementsDatasetReduce):
-    def apply(self, func, **kwargs):
+    def apply(self, func, args=(), **kwargs):
         """Apply a function over each Dataset in the group and concatenate them
         together into a new Dataset.
 
@@ -661,6 +663,8 @@ def apply(self, func, **kwargs):
         ----------
         func : function
             Callable to apply to each sub-dataset.
+        args : tuple, optional
+            Positional arguments to pass to `func`.
         **kwargs
             Used to call `func(ds, *args, **kwargs)` for each sub-dataset `ds`.
 
@@ -670,7 +674,7 @@ def apply(self, func, **kwargs):
             The result of splitting, applying and combining this dataset.
         """
         kwargs.pop('shortcut', None)  # ignore shortcut if set (for now)
-        applied = (func(ds, **kwargs) for ds in self._iter_grouped())
+        applied = (func(ds, *args, **kwargs) for ds in self._iter_grouped())
         return self._combine(applied)
 
     def _combine(self, applied):

diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -159,6 +159,10 @@ def convert_label_indexer(index, label, index_name='', method=None,
             indexer, new_index = index.get_loc_level(
                 tuple(label.values()), level=tuple(label.keys()))
 
+            # GH2619. Raise a KeyError if nothing is chosen
+            if indexer.dtype.kind == 'b' and indexer.sum() == 0:
+                raise KeyError('{} not found'.format(label))
+
     elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex):
         if _is_nested_tuple(label):
             indexer = index.get_locs(label)
@@ -168,7 +172,6 @@ def convert_label_indexer(index, label, index_name='', method=None,
             indexer, new_index = index.get_loc_level(
                 label, level=list(range(len(label)))
             )
-
     else:
         label = (label if getattr(label, 'ndim', 1) > 1  # vectorized-indexing
                  else _asarray_tuplesafe(label))

diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py
@@ -0,0 +1,119 @@
+# The remove_unused_levels defined here was adapted from the source code
+# defined in pandas.core.indexes.multi.py
+
+# For reference, here is a copy of the pandas copyright notice:
+
+# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
+# All rights reserved.
+
+# Copyright (c) 2008-2011 AQR Capital Management, LLC
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+
+#     * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+
+#     * Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials provided
+#        with the distribution.
+
+#     * Neither the name of the copyright holder nor the names of any
+#        contributors may be used to endorse or promote products derived
+#        from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+import pandas as pd
+
+
+# for pandas 0.19
+def remove_unused_levels(self):
+    """
+    Create a new MultiIndex from the current one, removing unused
+    levels, i.e. levels that are not expressed in the labels.
+    The resulting MultiIndex will have the same outward
+    appearance, meaning the same .values and ordering. It will also
+    be .equals() to the original.
+    .. versionadded:: 0.20.0
+    Returns
+    -------
+    MultiIndex
+    Examples
+    --------
+    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
+    MultiIndex(levels=[[0, 1], ['a', 'b']],
+               labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+    >>> i[2:]
+    MultiIndex(levels=[[0, 1], ['a', 'b']],
+               labels=[[1, 1], [0, 1]])
+    The 0 from the first level is not represented
+    and can be removed
+    >>> i[2:].remove_unused_levels()
+    MultiIndex(levels=[[1], ['a', 'b']],
+               labels=[[0, 0], [0, 1]])
+    """
+    import pandas.core.algorithms as algos
+
+    new_levels = []
+    new_labels = []
+
+    changed = False
+    for lev, lab in zip(self.levels, self.labels):
+
+        # Since few levels are typically unused, bincount() is more
+        # efficient than unique() - however it only accepts non-negative
+        # values (and drops order):
+        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
+        has_na = int(len(uniques) and (uniques[0] == -1))
+
+        if len(uniques) != len(lev) + has_na:
+            # We have unused levels
+            changed = True
+
+            # Recalculate uniques, now preserving order.
+            # Can easily be cythonized by exploiting the already existing
+            # "uniques" and stop parsing "lab" when all items are found:
+            uniques = algos.unique(lab)
+            if has_na:
+                na_idx = np.where(uniques == -1)[0]
+                # Just ensure that -1 is in first position:
+                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]
+
+            # labels get mapped from uniques to 0:len(uniques)
+            # -1 (if present) is mapped to last position
+            label_mapping = np.zeros(len(lev) + has_na)
+            # ... and reassigned value -1:
+            label_mapping[uniques] = np.arange(len(uniques)) - has_na
+
+            lab = label_mapping[lab]
+
+            # new levels are simple
+            lev = lev.take(uniques[has_na:])
+
+        new_levels.append(lev)
+        new_labels.append(lab)
+
+    result = self._shallow_copy()
+
+    if changed:
+        result._reset_identity()
+        result._set_levels(new_levels, validate=False)
+        result._set_labels(new_labels, validate=False)
+
+    return result
diff --git a/xarray/core/resample.py b/xarray/core/resample.py
@@ -129,7 +129,7 @@ def __init__(self, *args, **kwargs):
                              "('{}')! ".format(self._resample_dim, self._dim))
         super(DataArrayResample, self).__init__(*args, **kwargs)
 
-    def apply(self, func, shortcut=False, **kwargs):
+    def apply(self, func, shortcut=False, args=(), **kwargs):
         """Apply a function over each array in the group and concatenate them
         together into a new array.
 
@@ -158,6 +158,8 @@ def apply(self, func, shortcut=False, **kwargs):
             If these conditions are satisfied `shortcut` provides significant
             speedup. This should be the case for many common groupby operations
             (e.g., applying numpy ufuncs).
+        args : tuple, optional
+            Positional arguments passed on to `func`.
         **kwargs
             Used to call `func(ar, *args, **kwargs)` for each array `ar`.
 
@@ -167,7 +169,7 @@ def apply(self, func, shortcut=False, **kwargs):
             The result of splitting, applying and combining this array.
         """
         combined = super(DataArrayResample, self).apply(
-            func, shortcut=shortcut, **kwargs)
+            func, shortcut=shortcut, args=args, **kwargs)
 
         # If the aggregation function didn't drop the original resampling
         # dimension, then we need to do so before we can rename the proxy
@@ -240,7 +242,7 @@ def __init__(self, *args, **kwargs):
                              "('{}')! ".format(self._resample_dim, self._dim))
         super(DatasetResample, self).__init__(*args, **kwargs)
 
-    def apply(self, func, **kwargs):
+    def apply(self, func, args=(), **kwargs):
         """Apply a function over each Dataset in the groups generated for
         resampling  and concatenate them together into a new Dataset.
 
@@ -259,6 +261,8 @@ def apply(self, func, **kwargs):
         ----------
         func : function
             Callable to apply to each sub-dataset.
+        args : tuple, optional
+            Positional arguments passed on to `func`.
         **kwargs
             Used to call `func(ds, *args, **kwargs)` for each sub-dataset `ds`.
 
@@ -268,7 +272,7 @@ def apply(self, func, **kwargs):
             The result of splitting, applying and combining this dataset.
         """
         kwargs.pop('shortcut', None)  # ignore shortcut if set (for now)
-        applied = (func(ds, **kwargs) for ds in self._iter_grouped())
+        applied = (func(ds, *args, **kwargs) for ds in self._iter_grouped())
         combined = self._combine(applied)
 
         return combined.rename({self._resample_dim: self._dim})

diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
@@ -737,3 +737,16 @@ def test_encode_cf_datetime_overflow(shape):
     num, _, _ = encode_cf_datetime(dates, units, calendar)
     roundtrip = decode_cf_datetime(num, units, calendar)
     np.testing.assert_array_equal(dates, roundtrip)
+
+
+def test_encode_cf_datetime_pandas_min():
+    # Test that encode_cf_datetime does not fail for versions
+    # of pandas < 0.21.1 (GH 2623).
+    dates = pd.date_range('2000', periods=3)
+    num, units, calendar = encode_cf_datetime(dates)
+    expected_num = np.array([0., 1., 2.])
+    expected_units = 'days since 2000-01-01 00:00:00'
+    expected_calendar = 'proleptic_gregorian'
+    np.testing.assert_array_equal(num, expected_num)
+    assert units == expected_units
+    assert calendar == expected_calendar
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -1027,6 +1027,20 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False,
         assert_identical(mdata.sel(x={'one': 'a', 'two': 1}),
                          mdata.sel(one='a', two=1))
 
+    def test_selection_multiindex(self):
+        # GH2619. For MultiIndex, we need to call remove_unused_levels.
+        ds = xr.DataArray(np.arange(40).reshape(8, 5), dims=['x', 'y'],
+                          coords={'x': np.arange(8), 'y': np.arange(5)})
+        ds = ds.stack(xy=['x', 'y'])
+        ds_isel = ds.isel(xy=ds['x'] < 4)
+        with pytest.raises(KeyError):
+            ds_isel.sel(x=5)
+
+        actual = ds_isel.unstack()
+        expected = ds.reset_index('xy').isel(xy=ds['x'] < 4)
+        expected = expected.set_index(xy=['x', 'y']).unstack()
+        assert_identical(expected, actual)
+
     def test_virtual_default_coords(self):
         array = DataArray(np.zeros((5,)), dims='x')
         expected = DataArray(range(5), dims='x', name='x')
@@ -2281,6 +2295,17 @@ def test_resample(self):
         with raises_regex(ValueError, 'index must be monotonic'):
             array[[2, 0, 1]].resample(time='1D')
 
+    def test_da_resample_func_args(self):
+
+        def func(arg1, arg2, arg3=0.):
+            return arg1.mean('time') + arg2 + arg3
+
+        times = pd.date_range('2000', periods=3, freq='D')
+        da = xr.DataArray([1., 1., 1.], coords=[times], dims=['time'])
+        expected = xr.DataArray([3., 3., 3.], coords=[times], dims=['time'])
+        actual = da.resample(time='D').apply(func, args=(1.,), arg3=1.)
+        assert_identical(actual, expected)
+
     @requires_cftime
     def test_resample_cftimeindex(self):
         cftime = _import_cftime()