From 510a6e1741486890eaf1003123c8a11f426bde2d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 31 Dec 2020 14:30:28 -0800 Subject: [PATCH 01/10] Significantly improve unstacking performance --- xarray/core/dataset.py | 44 +++++++++++++++++++++++++++++++++++++++-- xarray/core/variable.py | 43 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3bb5cd8b586..30cba39d249 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3702,7 +3702,40 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset": + def _unstack_once_fast(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": + # FIXME: pass sparse through + index = self.get_index(dim) + index = remove_unused_levels_categories(index) + + variables: Dict[Hashable, Variable] = {} + indexes = {k: v for k, v in self.indexes.items() if k != dim} + + for name, var in self.variables.items(): + if name != dim: + if dim in var.dims: + # new_dims = dict(zip(new_dim_names, new_dim_sizes)) + if isinstance(fill_value, Mapping): + fill_value_ = fill_value[name] + else: + fill_value_ = fill_value + + variables[name] = var._unstack_once_fast( + index=index, dim=dim, fill_value=fill_value_ + ) + else: + variables[name] = var + + for name, lev in zip(index.names, index.levels): + variables[name] = IndexVariable(name, lev) + indexes[name] = lev + + coord_names = set(self._coord_names) - {dim} | set(index.names) + + return self._replace_with_new_dims( + variables, coord_names=coord_names, indexes=indexes + ) + + def _unstack_once(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": index = self.get_index(dim) index = remove_unused_levels_categories(index) full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3799,7 +3832,10 @@ def unstack( result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim, fill_value, sparse) + if sparse: + result = result._unstack_once(dim, fill_value, sparse) + else: + result = result._unstack_once_fast(dim, fill_value, sparse) return result def update(self, other: "CoercibleMapping") -> "Dataset": @@ -4896,6 +4932,10 @@ def _set_numpy_data_from_dataframe( self[name] = (dims, values) return + # NB: similar, more general logic, now exists in + # variable.unstack_once_fast; we could consider combining them at some + # point. + shape = tuple(lev.size for lev in idx.levels) indexer = tuple(idx.codes) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0a6eef44c90..b75def6d509 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -10,6 +10,7 @@ Any, Dict, Hashable, + List, Mapping, Optional, Sequence, @@ -1487,7 +1488,7 @@ def set_dims(self, dims, shape=None): ) return expanded_var.transpose(*dims) - def _stack_once(self, dims, new_dim): + def _stack_once(self, dims: List[Hashable], new_dim: Hashable): if not set(dims) <= set(self.dims): raise ValueError("invalid existing dimensions: %s" % dims) @@ -1543,7 +1544,9 @@ def stack(self, dimensions=None, **dimensions_kwargs): result = result._stack_once(dims, new_dim) return result - def _unstack_once(self, dims, old_dim): + def _unstack_once( + self, dims: Mapping[Hashable, int], old_dim: Hashable + ) -> "Variable": new_dim_names = tuple(dims.keys()) new_dim_sizes = tuple(dims.values()) @@ -1572,6 +1575,42 @@ def _unstack_once(self, dims, old_dim): return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True) + def _unstack_once_fast( + self, + index: pd.MultiIndex, + dim: Hashable, + fill_value=dtypes.NA, + ) -> "Variable": + """ + Unstacks this variable given an index to unstack and the name of the + dimension to which the index refers. + """ + + reordered = self.transpose(..., dim) + + new_dim_sizes = [lev.size for lev in index.levels] + new_dim_names = index.names + indexer = index.codes + + # Potentially we could replace `len(other_dims)` with just `-1` + other_dims = [d for d in self.dims if d != dim] + new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes + new_dims = reordered.dims[: len(other_dims)] + new_dim_names + + # missing_values = np.prod(new_shape) > np.prod(self.shape) + + if fill_value is dtypes.NA: + fill_value = dtypes.get_fill_value(self.dtype) + + data = np.full(new_shape, fill_value) + + # Indexer is a list of lists of locations. Each list is the locations + # on the new dimension. This is robust to the data being sparse; in that + # case the destinations will be NaN / zero. + data[(..., *indexer)] = reordered + + return self._replace(dims=new_dims, data=data) + def unstack(self, dimensions=None, **dimensions_kwargs): """ Unstack an existing dimension into multiple new dimensions. From a1e37b96dcf0d3b8d9c5c239e32f3926f5907f6a Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 31 Dec 2020 15:47:09 -0800 Subject: [PATCH 02/10] Hack to get sparse tests passing --- xarray/core/dataset.py | 8 ++++++-- xarray/core/variable.py | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 30cba39d249..6decfab454d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3713,7 +3713,6 @@ def _unstack_once_fast(self, dim: Hashable, fill_value, sparse: bool) -> "Datase for name, var in self.variables.items(): if name != dim: if dim in var.dims: - # new_dims = dict(zip(new_dim_names, new_dim_sizes)) if isinstance(fill_value, Mapping): fill_value_ = fill_value[name] else: @@ -3832,7 +3831,12 @@ def unstack( result = self.copy(deep=False) for dim in dims: - if sparse: + # FIXME: remove + import sparse as sparse_ + + if sparse or any( + isinstance(v.data, sparse_.COO) for v in self.variables.values() + ): result = result._unstack_once(dim, fill_value, sparse) else: result = result._unstack_once_fast(dim, fill_value, sparse) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b75def6d509..ec43c3b0ab5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1597,8 +1597,6 @@ def _unstack_once_fast( new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes new_dims = reordered.dims[: len(other_dims)] + new_dim_names - # missing_values = np.prod(new_shape) > np.prod(self.shape) - if fill_value is dtypes.NA: fill_value = dtypes.get_fill_value(self.dtype) From e64e4dd597178e75a9a436856188ae2152fc37dd Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 1 Jan 2021 14:55:01 -0800 Subject: [PATCH 03/10] Use the existing unstack function for dask & sparse --- xarray/core/dataset.py | 34 ++++++++++++++++++++++------------ xarray/core/variable.py | 35 ++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6decfab454d..f39f68ad339 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -80,7 +80,7 @@ ) from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs -from .pycompat import is_duck_dask_array +from .pycompat import is_duck_dask_array, sparse_array_type from .utils import ( Default, Frozen, @@ -3702,8 +3702,7 @@ def ensure_stackable(val): return data_array - def _unstack_once_fast(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": - # FIXME: pass sparse through + def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": index = self.get_index(dim) index = remove_unused_levels_categories(index) @@ -3718,7 +3717,7 @@ def _unstack_once_fast(self, dim: Hashable, fill_value, sparse: bool) -> "Datase else: fill_value_ = fill_value - variables[name] = var._unstack_once_fast( + variables[name] = var._unstack_once( index=index, dim=dim, fill_value=fill_value_ ) else: @@ -3734,7 +3733,9 @@ def _unstack_once_fast(self, dim: Hashable, fill_value, sparse: bool) -> "Datase variables, coord_names=coord_names, indexes=indexes ) - def _unstack_once(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": + def _unstack_full_reindex( + self, dim: Hashable, fill_value, sparse: bool + ) -> "Dataset": index = self.get_index(dim) index = remove_unused_levels_categories(index) full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3831,15 +3832,24 @@ def unstack( result = self.copy(deep=False) for dim in dims: - # FIXME: remove - import sparse as sparse_ - if sparse or any( - isinstance(v.data, sparse_.COO) for v in self.variables.values() + if ( + # Dask arrays don't support assignment by index, which the fast unstack + # function requires. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 + any(is_duck_dask_array(v.data) for v in self.variables.values()) + # Sparse doesn't currently support (though we could special-case + # it) + # https://github.com/pydata/sparse/issues/422 + or any( + isinstance(v.data, sparse_array_type) + for v in self.variables.values() + ) + or sparse ): - result = result._unstack_once(dim, fill_value, sparse) + result = result._unstack_full_reindex(dim, fill_value, sparse) else: - result = result._unstack_once_fast(dim, fill_value, sparse) + result = result._unstack_once(dim, fill_value) return result def update(self, other: "CoercibleMapping") -> "Dataset": @@ -4937,7 +4947,7 @@ def _set_numpy_data_from_dataframe( return # NB: similar, more general logic, now exists in - # variable.unstack_once_fast; we could consider combining them at some + # variable.unstack_once; we could consider combining them at some # point. shape = tuple(lev.size for lev in idx.levels) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ec43c3b0ab5..51e4b36e6f2 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1544,9 +1544,15 @@ def stack(self, dimensions=None, **dimensions_kwargs): result = result._stack_once(dims, new_dim) return result - def _unstack_once( + def _unstack_once_full( self, dims: Mapping[Hashable, int], old_dim: Hashable ) -> "Variable": + """ + Unstacks the variable without needing an index. + + Unlike `_unstack_once`, this function requires the existing dimension to + contain the full product of the new dimensions. + """ new_dim_names = tuple(dims.keys()) new_dim_sizes = tuple(dims.values()) @@ -1575,7 +1581,7 @@ def _unstack_once( return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True) - def _unstack_once_fast( + def _unstack_once( self, index: pd.MultiIndex, dim: Hashable, @@ -1598,9 +1604,22 @@ def _unstack_once_fast( new_dims = reordered.dims[: len(other_dims)] + new_dim_names if fill_value is dtypes.NA: - fill_value = dtypes.get_fill_value(self.dtype) + is_missing_values = np.prod(new_shape) > np.prod(self.shape) + if is_missing_values: + dtype, fill_value = dtypes.maybe_promote(self.dtype) + else: + dtype = self.dtype + fill_value = dtypes.get_fill_value(dtype) + else: + dtype = self.dtype - data = np.full(new_shape, fill_value) + # Currently fails on sparse due to https://github.com/pydata/sparse/issues/422 + data = np.full_like( + self.data, + fill_value=fill_value, + shape=new_shape, + dtype=dtype, + ) # Indexer is a list of lists of locations. Each list is the locations # on the new dimension. This is robust to the data being sparse; in that @@ -1616,6 +1635,10 @@ def unstack(self, dimensions=None, **dimensions_kwargs): New dimensions will be added at the end, and the order of the data along each new dimension will be in contiguous (C) order. + Note that unlike ``DataArray.unstack`` and ``Dataset.unstack``, this + method requires the existing dimension to contain the full product of + the new dimensions. + Parameters ---------- dimensions : mapping of hashable to mapping of hashable to int @@ -1634,11 +1657,13 @@ def unstack(self, dimensions=None, **dimensions_kwargs): See also -------- Variable.stack + DataArray.unstack + Dataset.unstack """ dimensions = either_dict_or_kwargs(dimensions, dimensions_kwargs, "unstack") result = self for old_dim, dims in dimensions.items(): - result = result._unstack_once(dims, old_dim) + result = result._unstack_once_full(dims, old_dim) return result def fillna(self, value): From ff8f5b038650d14500360db23381e5f1e21d35bd Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 1 Jan 2021 15:00:30 -0800 Subject: [PATCH 04/10] Add whatsnew --- doc/whats-new.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ecc134fc026..33962be65cb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -17,7 +17,7 @@ What's New .. _whats-new.0.16.3: -v0.16.3 (unreleased) +v0.17.0 (unreleased) -------------------- Breaking changes @@ -27,6 +27,10 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Significantly higher ``unstack`` performance on numpy-backed arrays which + contain missing values; 100x faster in our benchmark. + (:pull:`4746`); + By `Maximilian Roos `_. Bug fixes From 1c150fce90a9dfd19d32f432e76bfbb9cf999382 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 1 Jan 2021 15:12:10 -0800 Subject: [PATCH 05/10] Require numpy 1.17 for new unstack --- xarray/core/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f39f68ad339..6db33a27e9e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4,6 +4,7 @@ import sys import warnings from collections import defaultdict +from distutils.version import LooseVersion from html import escape from numbers import Number from operator import methodcaller @@ -3846,6 +3847,8 @@ def unstack( for v in self.variables.values() ) or sparse + # numpy full_like only added `shape` in 1.17 + or LooseVersion(np.__version__) < LooseVersion("1.17") ): result = result._unstack_full_reindex(dim, fill_value, sparse) else: From b33adedbfbd92df0f4188568691c7e2915bf8c19 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Fri, 1 Jan 2021 19:58:43 -0800 Subject: [PATCH 06/10] Also special case pint --- xarray/core/dataset.py | 8 +++++++- xarray/core/pycompat.py | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6db33a27e9e..9089b1df173 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -81,7 +81,7 @@ ) from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs -from .pycompat import is_duck_dask_array, sparse_array_type +from .pycompat import is_duck_dask_array, pint_array_type, sparse_array_type from .utils import ( Default, Frozen, @@ -3849,6 +3849,12 @@ def unstack( or sparse # numpy full_like only added `shape` in 1.17 or LooseVersion(np.__version__) < LooseVersion("1.17") + # pint doesn't implement `np.full_like` in a way that's + # currently compatible. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 + or any( + isinstance(v.data, pint_array_type) for v in self.variables.values() + ) ): result = result._unstack_full_reindex(dim, fill_value, sparse) else: diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 8d613038957..5f81f8530bc 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -35,3 +35,10 @@ def is_duck_dask_array(x): cupy_array_type = (cupy.ndarray,) except ImportError: # pragma: no cover cupy_array_type = () + +try: + import pint + + pint_array_type = (pint.Quantity,) +except ImportError: # pragma: no cover + pint_array_type = () From 9a53fec51e25f73eed61cb47640a1899d8de913d Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sun, 3 Jan 2021 14:08:01 -0800 Subject: [PATCH 07/10] Revert "Also special case pint" This reverts commit b33adedbfbd92df0f4188568691c7e2915bf8c19. --- xarray/core/dataset.py | 8 +------- xarray/core/pycompat.py | 7 ------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9089b1df173..6db33a27e9e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -81,7 +81,7 @@ ) from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs -from .pycompat import is_duck_dask_array, pint_array_type, sparse_array_type +from .pycompat import is_duck_dask_array, sparse_array_type from .utils import ( Default, Frozen, @@ -3849,12 +3849,6 @@ def unstack( or sparse # numpy full_like only added `shape` in 1.17 or LooseVersion(np.__version__) < LooseVersion("1.17") - # pint doesn't implement `np.full_like` in a way that's - # currently compatible. - # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 - or any( - isinstance(v.data, pint_array_type) for v in self.variables.values() - ) ): result = result._unstack_full_reindex(dim, fill_value, sparse) else: diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 5f81f8530bc..8d613038957 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -35,10 +35,3 @@ def is_duck_dask_array(x): cupy_array_type = (cupy.ndarray,) except ImportError: # pragma: no cover cupy_array_type = () - -try: - import pint - - pint_array_type = (pint.Quantity,) -except ImportError: # pragma: no cover - pint_array_type = () From e2ba14484ee648ab8c0613574c1d782908189795 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sun, 3 Jan 2021 14:13:18 -0800 Subject: [PATCH 08/10] Only run fast unstack on numpy arrays --- xarray/core/dataset.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6db33a27e9e..8f2cf05b29e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3849,6 +3849,18 @@ def unstack( or sparse # numpy full_like only added `shape` in 1.17 or LooseVersion(np.__version__) < LooseVersion("1.17") + # Until https://github.com/pydata/xarray/pull/4751 is resolved, + # we check explicitly whether it's a numpy array. Once that is + # resolved, explicitly exclude pint arrays. + # # pint doesn't implement `np.full_like` in a way that's + # # currently compatible. + # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 + # # or any( + # # isinstance(v.data, pint_array_type) for v in self.variables.values() + # # ) + or any( + not isinstance(v.data, np.ndarray) for v in self.variables.values() + ) ): result = result._unstack_full_reindex(dim, fill_value, sparse) else: From c280cd9c2a3f2544cf3330eca3b3575d9c60ecb1 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 14 Jan 2021 14:48:11 -0800 Subject: [PATCH 09/10] Update asvs for unstacking --- asv_bench/benchmarks/unstacking.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py index 342475b96df..8d0c3932870 100644 --- a/asv_bench/benchmarks/unstacking.py +++ b/asv_bench/benchmarks/unstacking.py @@ -7,18 +7,23 @@ class Unstacking: def setup(self): - data = np.random.RandomState(0).randn(1, 1000, 500) - self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"]) + data = np.random.RandomState(0).randn(500, 1000) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + self.df_missing = self.da_missing.to_pandas() def time_unstack_fast(self): - self.ds.unstack("flat_dim") + self.da_full.unstack("flat_dim") def time_unstack_slow(self): - self.ds[:, ::-1].unstack("flat_dim") + self.da_missing.unstack("flat_dim") + + def time_unstack_pandas_slow(self): + self.df_missing.unstack() class UnstackingDask(Unstacking): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds = self.ds.chunk({"flat_dim": 50}) + self.da_full = self.da_full.chunk({"flat_dim": 50}) From be76382d2bfbd29ee81a8f8711835153106a46c3 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 14 Jan 2021 14:49:19 -0800 Subject: [PATCH 10/10] Update whatsnew --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b4e077a99c3..d39da4a80f6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,7 +34,7 @@ Breaking changes New Features ~~~~~~~~~~~~ - Significantly higher ``unstack`` performance on numpy-backed arrays which - contain missing values; 100x faster in our benchmark. + contain missing values; 8x faster in our benchmark, and 2x faster than pandas. (:pull:`4746`); By `Maximilian Roos `_.