Skip to content

Commit

Permalink
ENH: keepdims=True for xarray reductions (#3033)
Browse files Browse the repository at this point in the history
* ENH: keepdims=True for xarray reductions

Addresses #2170

Add a new option `keepdims` to xarray reduce operations, following the
behaviour of NumPy.

`keepdims` may be passed to reductions on either Datasets or DataArrays,
and will result in the reduced dimensions still being present in the
output with size 1.

Coordinates that depend on the reduced dimensions will be removed from
the Dataset/DataArray.

* Set the default to be `False`

* Correct lint error

* Apply suggestions from code review

Co-Authored-By: Maximilian Roos <[email protected]>

* Add test for dask and fix implementation

* Move 'keepdims' up to where 'dims' is set

* Fix lint, add test for scalar variable
  • Loading branch information
Scott Wales authored and shoyer committed Jun 23, 2019
1 parent 724ad83 commit ff41988
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 8 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ Enhancements
~~~~~~~~~~~~


- Add ``keepdims`` argument for reduce operations (:issue:`2170`)
By `Scott Wales <https://github.com/ScottWales>`_.
- netCDF chunksizes are now only dropped when original_shape is different,
not when it isn't found. (:issue:`2207`)
By `Karel van de Plassche <https://github.com/Karel-van-de-Plassche>`_.
Expand Down
18 changes: 15 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,14 @@ def _replace(self, variable=None, coords=None, name=__default):
return type(self)(variable, coords, name=name, fastpath=True)

def _replace_maybe_drop_dims(self, variable, name=__default):
if variable.dims == self.dims:
if variable.dims == self.dims and variable.shape == self.shape:
coords = self._coords.copy()
elif variable.dims == self.dims:
# Shape has changed (e.g. from reduce(..., keepdims=True))
new_sizes = dict(zip(self.dims, variable.shape))
coords = OrderedDict((k, v) for k, v in self._coords.items()
if v.shape == tuple(new_sizes[d]
for d in v.dims))
else:
allowed_dims = set(variable.dims)
coords = OrderedDict((k, v) for k, v in self._coords.items()
Expand Down Expand Up @@ -1642,7 +1648,8 @@ def combine_first(self, other):
"""
return ops.fillna(self, other, join="outer")

def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs):
def reduce(self, func, dim=None, axis=None, keep_attrs=None,
keepdims=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).
Parameters
Expand All @@ -1662,6 +1669,10 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs):
If True, the variable's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.
keepdims : bool, default False
If True, the dimensions which are reduced are left in the result
as dimensions of size one. Coordinates that use these dimensions
are removed.
**kwargs : dict
Additional keyword arguments passed on to `func`.
Expand All @@ -1672,7 +1683,8 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=None, **kwargs):
summarized data and the indicated dimension(s) removed.
"""

var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
var = self.variable.reduce(func, dim, axis, keep_attrs, keepdims,
**kwargs)
return self._replace_maybe_drop_dims(var)

def to_pandas(self):
Expand Down
9 changes: 7 additions & 2 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3152,8 +3152,8 @@ def combine_first(self, other):
out = ops.fillna(self, other, join="outer", dataset_join="outer")
return out

def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False,
allow_lazy=False, **kwargs):
def reduce(self, func, dim=None, keep_attrs=None, keepdims=False,
numeric_only=False, allow_lazy=False, **kwargs):
"""Reduce this dataset by applying `func` along some dimension(s).
Parameters
Expand All @@ -3169,6 +3169,10 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False,
If True, the dataset's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.
keepdims : bool, default False
If True, the dimensions which are reduced are left in the result
as dimensions of size one. Coordinates that use these dimensions
are removed.
numeric_only : bool, optional
If True, only apply ``func`` to variables with a numeric dtype.
**kwargs : dict
Expand Down Expand Up @@ -3218,6 +3222,7 @@ def reduce(self, func, dim=None, keep_attrs=None, numeric_only=False,
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
keepdims=keepdims,
allow_lazy=allow_lazy,
**kwargs)

Expand Down
20 changes: 17 additions & 3 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1334,7 +1334,7 @@ def where(self, cond, other=dtypes.NA):
return ops.where_method(self, cond, other)

def reduce(self, func, dim=None, axis=None,
keep_attrs=None, allow_lazy=False, **kwargs):
keep_attrs=None, keepdims=False, allow_lazy=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).
Parameters
Expand All @@ -1354,6 +1354,9 @@ def reduce(self, func, dim=None, axis=None,
If True, the variable's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.
keepdims : bool, default False
If True, the dimensions which are reduced are left in the result
as dimensions of size one.
**kwargs : dict
Additional keyword arguments passed on to `func`.
Expand Down Expand Up @@ -1381,8 +1384,19 @@ def reduce(self, func, dim=None, axis=None,
else:
removed_axes = (range(self.ndim) if axis is None
else np.atleast_1d(axis) % self.ndim)
dims = [adim for n, adim in enumerate(self.dims)
if n not in removed_axes]
if keepdims:
# Insert np.newaxis for removed dims
slices = tuple(np.newaxis if i in removed_axes else
slice(None, None) for i in range(self.ndim))
if getattr(data, 'shape', None) is None:
# Reduce has produced a scalar value, not an array-like
data = np.asanyarray(data)[slices]
else:
data = data[slices]
dims = self.dims
else:
dims = [adim for n, adim in enumerate(self.dims)
if n not in removed_axes]

if keep_attrs is None:
keep_attrs = _get_keep_attrs(default=False)
Expand Down
38 changes: 38 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1991,6 +1991,44 @@ def test_reduce(self):
dims=['x', 'y']).mean('x')
assert_equal(actual, expected)

def test_reduce_keepdims(self):
    """Reducing with ``keepdims=True`` keeps the reduced dims at size one.

    The expected results drop every coordinate that uses a reduced
    dimension and keep the remaining ones.
    """
    coords = {
        'x': [-1, -2],
        'y': ['ab', 'cd', 'ef'],
        'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
        'c': -999,
    }
    orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y'])

    # Reducing over all axes: only the scalar coordinate 'c' is expected
    # to remain, because every other coordinate uses 'x' and/or 'y'.
    kept = ['c']
    actual = orig.mean(keepdims=True)
    expected_coords = {name: value for name, value in coords.items()
                       if name in kept}
    expected = DataArray(orig.data.mean(keepdims=True),
                         dims=orig.dims,
                         coords=expected_coords)
    assert_equal(actual, expected)

    # Both dimensions are still present, each with length one.
    for dim_name in ('x', 'y'):
        assert actual.sizes[dim_name] == 1

    # Reducing over 'y' only: the coordinates that involve 'y' ('y' itself
    # and 'lat') are expected to be dropped, while 'x' and 'c' are kept.
    dropped = ['y', 'lat']
    actual = orig.mean('y', keepdims=True)
    expected_coords = {name: value for name, value in coords.items()
                       if name not in dropped}
    expected = DataArray(orig.data.mean(axis=1, keepdims=True),
                         dims=orig.dims,
                         coords=expected_coords)
    assert_equal(actual, expected)

@requires_bottleneck
def test_reduce_keepdims_bottleneck(self):
    """``keepdims=True`` works for a reduction function lacking that option.

    ``bottleneck.nanmean`` is passed to ``reduce`` directly and the result
    is compared against the built-in ``mean`` with ``keepdims=True``.
    """
    import bottleneck

    coords = {
        'x': [-1, -2],
        'y': ['ab', 'cd', 'ef'],
        'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
        'c': -999,
    }
    orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y'])

    # Bottleneck does not have its own keepdims implementation, so the
    # size-one dimensions have to be supplied by the reduce machinery.
    assert_equal(orig.reduce(bottleneck.nanmean, keepdims=True),
                 orig.mean(keepdims=True))

def test_reduce_dtype(self):
coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
Expand Down
19 changes: 19 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3898,6 +3898,25 @@ def total_sum(x):
with raises_regex(TypeError, "unexpected keyword argument 'axis'"):
ds.reduce(total_sum, dim='x')

def test_reduce_keepdims(self):
    """Dataset reductions with ``keepdims=True`` match numpy's output shape.

    Coordinates that use a reduced dimension are expected to be absent
    from the result.
    """
    dataset = Dataset(
        {'a': (['x', 'y'], [[0, 1, 2, 3, 4]])},
        coords={
            'y': [0, 1, 2, 3, 4],
            'x': [0],
            'lat': (['x', 'y'], [[0, 1, 2, 3, 4]]),
            'c': -999.0,
        },
    )

    # Reduce over every dimension: the data keeps both dims (as in numpy
    # with keepdims=True) and only the scalar coordinate 'c' is expected.
    reduced = dataset.mean(keepdims=True)
    full_mean = np.mean(dataset.a, keepdims=True)
    expected = Dataset({'a': (['x', 'y'], full_mean)},
                       coords={'c': dataset.c})
    assert_identical(expected, reduced)

    # Reduce over 'x' only: 'y' and 'c' are expected to be kept, while
    # 'x' and 'lat' (which use 'x') are not.
    reduced = dataset.mean('x', keepdims=True)
    x_mean = np.mean(dataset.a, axis=0, keepdims=True)
    expected = Dataset({'a': (['x', 'y'], x_mean)},
                       coords={'y': dataset.y, 'c': dataset.c})
    assert_identical(expected, reduced)

def test_quantile(self):

ds = create_test_data(seed=123)
Expand Down
36 changes: 36 additions & 0 deletions xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,42 @@ def test_reduce_funcs(self):
assert_identical(
v.max(), Variable([], pd.Timestamp('2000-01-03')))

def test_reduce_keepdims(self):
    """``Variable.mean(keepdims=True)`` agrees with ``np.mean(keepdims=True)``.

    Checked for a full reduction, for each single dimension, for both
    dimensions given by name, and for a zero-dimensional variable.
    """
    v = Variable(['x', 'y'], self.d)

    # No dimension given: reduce over everything.
    assert_identical(v.mean(keepdims=True),
                     Variable(v.dims, np.mean(self.d, keepdims=True)))

    # Each named dimension (or list of them) paired with the numpy axis
    # argument used to build the reference result.
    dim_axis_pairs = [('x', 0), ('y', 1), (['y', 'x'], (1, 0))]
    for dim, axis in dim_axis_pairs:
        reduced = v.mean(dim=dim, keepdims=True)
        reference = np.mean(self.d, axis=axis, keepdims=True)
        assert_identical(reduced, Variable(v.dims, reference))

    # A variable without dimensions.
    v = Variable([], 1.0)
    reference = np.mean(v.data, keepdims=True)
    assert_identical(v.mean(keepdims=True), Variable([], reference))

@requires_dask
def test_reduce_keepdims_dask(self):
    """``keepdims=True`` on a chunked variable yields dask-backed results.

    Each result must hold a ``dask.array.Array`` and be identical to the
    corresponding numpy reduction of the underlying data.
    """
    import dask.array

    dask_array_type = dask.array.Array
    v = Variable(['x', 'y'], self.d).chunk()

    # Reduction over all dimensions.
    actual = v.mean(keepdims=True)
    assert isinstance(actual.data, dask_array_type)
    assert_identical(actual,
                     Variable(v.dims, np.mean(self.d, keepdims=True)))

    # Reduction over 'y' only (axis 1 of the underlying data).
    actual = v.mean(dim='y', keepdims=True)
    assert isinstance(actual.data, dask_array_type)
    assert_identical(actual,
                     Variable(v.dims,
                              np.mean(self.d, axis=1, keepdims=True)))

def test_reduce_keep_attrs(self):
_attrs = {'units': 'test', 'long_name': 'testing'}

Expand Down

0 comments on commit ff41988

Please sign in to comment.