diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index dafa0fc47e1..7e0b11b815a 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -65,6 +65,7 @@ "bottleneck": [""], "dask": [""], "distributed": [""], + "sparse": [""] }, diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index 02c3896e236..aa600c88003 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -22,6 +22,13 @@ def requires_dask(): raise NotImplementedError() +def requires_sparse(): + try: + import sparse # noqa: F401 + except ImportError: + raise NotImplementedError() + + def randn(shape, frac_nan=None, chunks=None, seed=0): rng = np.random.RandomState(seed) if chunks is None: diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py index 2c5b7ca7821..dc8bc3307c3 100644 --- a/asv_bench/benchmarks/unstacking.py +++ b/asv_bench/benchmarks/unstacking.py @@ -1,8 +1,9 @@ import numpy as np +import pandas as pd import xarray as xr -from . import requires_dask +from . import requires_dask, requires_sparse class Unstacking: @@ -27,3 +28,37 @@ def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) self.da_full = self.da_full.chunk({"flat_dim": 25}) + + +class UnstackingSparse(Unstacking): + def setup(self, *args, **kwargs): + requires_sparse() + + import sparse + + data = sparse.random((500, 1000), random_state=0, fill_value=0) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + + mindex = pd.MultiIndex.from_arrays([np.arange(100), np.arange(100)]) + self.da_eye_2d = xr.DataArray(np.ones((100,)), dims="z", coords={"z": mindex}) + self.da_eye_3d = xr.DataArray( + np.ones((100, 50)), + dims=("z", "foo"), + coords={"z": mindex, "foo": np.arange(50)}, + ) + + def time_unstack_to_sparse_2d(self): + self.da_eye_2d.unstack(sparse=True) + + def time_unstack_to_sparse_3d(self): + self.da_eye_3d.unstack(sparse=True) + + def peakmem_unstack_to_sparse_2d(self): + self.da_eye_2d.unstack(sparse=True) + + def peakmem_unstack_to_sparse_3d(self): + self.da_eye_3d.unstack(sparse=True) + + def time_unstack_pandas_slow(self): + pass diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8c49a648bd6..9a6f6e21e5e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,12 @@ Deprecations ~~~~~~~~~~~~ +Performance +~~~~~~~~~~~ + +- Significantly faster unstacking to a ``sparse`` array. :pull:`5577` + By `Deepak Cherian `_. + Bug fixes ~~~~~~~~~ - :py:func:`xr.map_blocks` and :py:func:`xr.corr` now work when dask is not installed (:issue:`3391`, :issue:`5715`, :pull:`5731`). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e882495dce5..83c7b154658 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4045,7 +4045,9 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": + def _unstack_once( + self, dim: Hashable, fill_value, sparse: bool = False + ) -> "Dataset": index = self.get_index(dim) index = remove_unused_levels_categories(index) @@ -4061,7 +4063,7 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": fill_value_ = fill_value variables[name] = var._unstack_once( - index=index, dim=dim, fill_value=fill_value_ + index=index, dim=dim, fill_value=fill_value_, sparse=sparse ) else: variables[name] = var @@ -4195,7 +4197,7 @@ def unstack( # Once that is resolved, explicitly exclude pint arrays. # pint doesn't implement `np.full_like` in a way that's # currently compatible. - needs_full_reindex = sparse or any( + needs_full_reindex = any( is_duck_dask_array(v.data) or isinstance(v.data, sparse_array_type) or not isinstance(v.data, np.ndarray) @@ -4206,7 +4208,7 @@ def unstack( if needs_full_reindex: result = result._unstack_full_reindex(dim, fill_value, sparse) else: - result = result._unstack_once(dim, fill_value) + result = result._unstack_once(dim, fill_value, sparse) return result def update(self, other: "CoercibleMapping") -> "Dataset": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 52125ec4113..e2d02b41a17 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1631,6 +1631,7 @@ def _unstack_once( index: pd.MultiIndex, dim: Hashable, fill_value=dtypes.NA, + sparse: bool = False, ) -> "Variable": """ Unstacks this variable given an index to unstack and the name of the @@ -1658,19 +1659,38 @@ def _unstack_once( else: dtype = self.dtype - data = np.full_like( - self.data, - fill_value=fill_value, - shape=new_shape, - dtype=dtype, - ) + if sparse: + # unstacking a dense multitindexed array to a sparse array + from sparse import COO + + codes = zip(*index.codes) + if reordered.ndim == 1: + indexes = codes + else: + sizes = itertools.product(*[range(s) for s in reordered.shape[:-1]]) + tuple_indexes = itertools.product(sizes, codes) + indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes) # type: ignore + + data = COO( + coords=np.array(list(indexes)).T, + data=self.data.astype(dtype).ravel(), + fill_value=fill_value, + shape=new_shape, + sorted=index.is_monotonic_increasing, + ) + + else: + data = np.full_like( + self.data, + fill_value=fill_value, + shape=new_shape, + dtype=dtype, + ) - # Indexer is a list of lists of locations. Each list is the locations - # on the new dimension. This is robust to the data being sparse; in that - # case the destinations will be NaN / zero. - # sparse doesn't support item assigment, - # https://github.com/pydata/sparse/issues/114 - data[(..., *indexer)] = reordered + # Indexer is a list of lists of locations. Each list is the locations + # on the new dimension. This is robust to the data being sparse; in that + # case the destinations will be NaN / zero. + data[(..., *indexer)] = reordered return self._replace(dims=new_dims, data=data) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index cdb8382c8ee..16148c21b43 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -28,7 +28,7 @@ from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.indexes import Index -from xarray.core.pycompat import integer_types +from xarray.core.pycompat import integer_types, sparse_array_type from xarray.core.utils import is_scalar from . import ( @@ -3085,14 +3085,42 @@ def test_unstack_sparse(self): # test fill_value actual = ds.unstack("index", sparse=True) expected = ds.unstack("index") + assert isinstance(actual["var"].data, sparse_array_type) assert actual["var"].variable._to_dense().equals(expected["var"].variable) assert actual["var"].data.density < 1.0 actual = ds["var"].unstack("index", sparse=True) expected = ds["var"].unstack("index") + assert isinstance(actual.data, sparse_array_type) assert actual.variable._to_dense().equals(expected.variable) assert actual.data.density < 1.0 + mindex = pd.MultiIndex.from_arrays( + [np.arange(3), np.arange(3)], names=["a", "b"] + ) + ds_eye = Dataset( + {"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))}, + coords={"z": mindex, "foo": np.arange(4), "bar": np.arange(5)}, + ) + actual = ds_eye.unstack(sparse=True, fill_value=0) + assert isinstance(actual["var"].data, sparse_array_type) + expected = xr.Dataset( + { + "var": ( + ("foo", "bar", "a", "b"), + np.broadcast_to(np.eye(3, 3), (4, 5, 3, 3)), + ) + }, + coords={ + "foo": np.arange(4), + "bar": np.arange(5), + "a": np.arange(3), + "b": np.arange(3), + }, + ) + actual["var"].data = actual["var"].data.todense() + assert_equal(expected, actual) + def test_stack_unstack_fast(self): ds = Dataset( {