Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster unstacking #4746

Merged
merged 16 commits into from
Jan 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions asv_bench/benchmarks/unstacking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,23 @@

class Unstacking:
def setup(self):
data = np.random.RandomState(0).randn(1, 1000, 500)
self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
data = np.random.RandomState(0).randn(500, 1000)
self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
self.da_missing = self.da_full[:-1]
self.df_missing = self.da_missing.to_pandas()

def time_unstack_fast(self):
self.ds.unstack("flat_dim")
self.da_full.unstack("flat_dim")

def time_unstack_slow(self):
self.ds[:, ::-1].unstack("flat_dim")
self.da_missing.unstack("flat_dim")

def time_unstack_pandas_slow(self):
self.df_missing.unstack()


class UnstackingDask(Unstacking):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.ds = self.ds.chunk({"flat_dim": 50})
self.da_full = self.da_full.chunk({"flat_dim": 50})
7 changes: 6 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ What's New

.. _whats-new.0.16.3:

v0.16.3 (unreleased)
v0.17.0 (unreleased)
--------------------

Breaking changes
Expand Down Expand Up @@ -45,6 +45,11 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Significantly higher ``unstack`` performance on numpy-backed arrays which
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
(:pull:`4746`);
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
By `Deepak Cherian <https://github.com/dcherian>`_

Expand Down
75 changes: 72 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import warnings
from collections import defaultdict
from distutils.version import LooseVersion
from html import escape
from numbers import Number
from operator import methodcaller
Expand Down Expand Up @@ -79,7 +80,7 @@
)
from .missing import get_clean_interp_index
from .options import OPTIONS, _get_keep_attrs
from .pycompat import is_duck_dask_array
from .pycompat import is_duck_dask_array, sparse_array_type
from .utils import (
Default,
Frozen,
Expand Down Expand Up @@ -3707,7 +3708,40 @@ def ensure_stackable(val):

return data_array

def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset":
def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
    """Unstack a single stacked dimension via fast per-variable index assignment.

    Fast path used by ``Dataset.unstack`` for numpy-backed data: each
    variable is unstacked with ``Variable._unstack_once`` (allocate-and-assign)
    instead of a full reindex against the product MultiIndex.

    Parameters
    ----------
    dim : Hashable
        Name of the stacked dimension (must be indexed by a pandas
        MultiIndex on this Dataset).
    fill_value
        Scalar used for locations missing from the MultiIndex, or a
        Mapping from variable name to such a scalar.

    Returns
    -------
    Dataset
        A new Dataset with ``dim`` replaced by the MultiIndex levels.
    """
    index = self.get_index(dim)
    # Drop unused MultiIndex levels/categories so the unstacked dims only
    # contain labels actually present in the index.
    index = remove_unused_levels_categories(index)

    variables: Dict[Hashable, Variable] = {}
    indexes = {k: v for k, v in self.indexes.items() if k != dim}

    for name, var in self.variables.items():
        if name != dim:
            if dim in var.dims:
                if isinstance(fill_value, Mapping):
                    # Per-variable fill values: fall back to NA for any
                    # variable the mapping does not cover, rather than
                    # raising KeyError.
                    fill_value_ = fill_value.get(name, dtypes.NA)
                else:
                    fill_value_ = fill_value

                variables[name] = var._unstack_once(
                    index=index, dim=dim, fill_value=fill_value_
                )
            else:
                # Variable doesn't use the stacked dim; carry it through.
                variables[name] = var

    # Each MultiIndex level becomes a new indexed coordinate/dimension.
    for name, lev in zip(index.names, index.levels):
        variables[name] = IndexVariable(name, lev)
        indexes[name] = lev

    coord_names = set(self._coord_names) - {dim} | set(index.names)

    return self._replace_with_new_dims(
        variables, coord_names=coord_names, indexes=indexes
    )

def _unstack_full_reindex(
self, dim: Hashable, fill_value, sparse: bool
) -> "Dataset":
index = self.get_index(dim)
index = remove_unused_levels_categories(index)
full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
Expand Down Expand Up @@ -3804,7 +3838,38 @@ def unstack(

result = self.copy(deep=False)
for dim in dims:
result = result._unstack_once(dim, fill_value, sparse)

if (
# Dask arrays don't support assignment by index, which the fast unstack
# function requires.
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
any(is_duck_dask_array(v.data) for v in self.variables.values())
# Sparse doesn't currently support (though we could special-case
# it)
# https://github.com/pydata/sparse/issues/422
or any(
isinstance(v.data, sparse_array_type)
for v in self.variables.values()
)
or sparse
# numpy full_like only added `shape` in 1.17
or LooseVersion(np.__version__) < LooseVersion("1.17")
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
# we check explicitly whether it's a numpy array. Once that is
# resolved, explicitly exclude pint arrays.
# # pint doesn't implement `np.full_like` in a way that's
# # currently compatible.
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
# # or any(
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
# # )
or any(
not isinstance(v.data, np.ndarray) for v in self.variables.values()
)
):
result = result._unstack_full_reindex(dim, fill_value, sparse)
else:
result = result._unstack_once(dim, fill_value)
return result

def update(self, other: "CoercibleMapping") -> "Dataset":
Expand Down Expand Up @@ -4974,6 +5039,10 @@ def _set_numpy_data_from_dataframe(
self[name] = (dims, values)
return

# NB: similar, more general logic, now exists in
# variable.unstack_once; we could consider combining them at some
# point.

shape = tuple(lev.size for lev in idx.levels)
indexer = tuple(idx.codes)

Expand Down
68 changes: 65 additions & 3 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Any,
Dict,
Hashable,
List,
Mapping,
Optional,
Sequence,
Expand Down Expand Up @@ -1488,7 +1489,7 @@ def set_dims(self, dims, shape=None):
)
return expanded_var.transpose(*dims)

def _stack_once(self, dims, new_dim):
def _stack_once(self, dims: List[Hashable], new_dim: Hashable):
if not set(dims) <= set(self.dims):
raise ValueError("invalid existing dimensions: %s" % dims)

Expand Down Expand Up @@ -1544,7 +1545,15 @@ def stack(self, dimensions=None, **dimensions_kwargs):
result = result._stack_once(dims, new_dim)
return result

def _unstack_once(self, dims, old_dim):
def _unstack_once_full(
self, dims: Mapping[Hashable, int], old_dim: Hashable
) -> "Variable":
"""
Unstacks the variable without needing an index.

Unlike `_unstack_once`, this function requires the existing dimension to
contain the full product of the new dimensions.
"""
new_dim_names = tuple(dims.keys())
new_dim_sizes = tuple(dims.values())

Expand Down Expand Up @@ -1573,13 +1582,64 @@ def _unstack_once(self, dims, old_dim):

return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True)

def _unstack_once(
    self,
    index: pd.MultiIndex,
    dim: Hashable,
    fill_value=dtypes.NA,
) -> "Variable":
    """
    Unstacks this variable given an index to unstack and the name of the
    dimension to which the index refers.

    This is the fast path used by ``Dataset._unstack_once``: it allocates
    the full output array once and assigns the existing values into it by
    integer index, filling locations absent from ``index`` with
    ``fill_value``. Unlike ``_unstack_once_full``, ``index`` need not
    contain the full product of the new dimensions.

    Parameters
    ----------
    index : pd.MultiIndex
        MultiIndex describing the stacked dimension; its levels become the
        new dimensions and its codes give each element's destination.
    dim : Hashable
        Name of the existing (stacked) dimension to remove.
    fill_value : Any, optional
        Value for positions in the new shape not covered by ``index``.
        Defaults to ``dtypes.NA``, which triggers automatic dtype
        promotion when needed.
    """

    # Move the stacked dimension last so assignment below can address it
    # with a trailing indexer tuple.
    reordered = self.transpose(..., dim)

    new_dim_sizes = [lev.size for lev in index.levels]
    new_dim_names = index.names
    indexer = index.codes

    # Potentially we could replace `len(other_dims)` with just `-1`
    other_dims = [d for d in self.dims if d != dim]
    new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes
    new_dims = reordered.dims[: len(other_dims)] + new_dim_names

    if fill_value is dtypes.NA:
        # If the product of the new shape exceeds the number of existing
        # elements, some output positions won't be written to, so the
        # dtype may need promotion (e.g. int -> float) to hold NA.
        is_missing_values = np.prod(new_shape) > np.prod(self.shape)
        if is_missing_values:
            dtype, fill_value = dtypes.maybe_promote(self.dtype)
        else:
            dtype = self.dtype
            fill_value = dtypes.get_fill_value(dtype)
    else:
        dtype = self.dtype

    # Currently fails on sparse due to https://github.com/pydata/sparse/issues/422
    # np.full_like (with `shape=`) keeps the duck-array type of self.data.
    data = np.full_like(
        self.data,
        fill_value=fill_value,
        shape=new_shape,
        dtype=dtype,
    )

    # Indexer is a list of lists of locations. Each list is the locations
    # on the new dimension. This is robust to the data being sparse; in that
    # case the destinations will be NaN / zero.
    data[(..., *indexer)] = reordered

    return self._replace(dims=new_dims, data=data)

def unstack(self, dimensions=None, **dimensions_kwargs):
"""
Unstack an existing dimension into multiple new dimensions.

New dimensions will be added at the end, and the order of the data
along each new dimension will be in contiguous (C) order.

Note that unlike ``DataArray.unstack`` and ``Dataset.unstack``, this
method requires the existing dimension to contain the full product of
the new dimensions.

Parameters
----------
dimensions : mapping of hashable to mapping of hashable to int
Expand All @@ -1598,11 +1658,13 @@ def unstack(self, dimensions=None, **dimensions_kwargs):
See also
--------
Variable.stack
DataArray.unstack
Dataset.unstack
"""
dimensions = either_dict_or_kwargs(dimensions, dimensions_kwargs, "unstack")
result = self
for old_dim, dims in dimensions.items():
result = result._unstack_once(dims, old_dim)
result = result._unstack_once_full(dims, old_dim)
return result

def fillna(self, value):
Expand Down