diff --git a/doc/api.rst b/doc/api.rst index b552bc6b4d2..d2c222da4db 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -106,6 +106,7 @@ Dataset contents Dataset.swap_dims Dataset.expand_dims Dataset.drop_vars + Dataset.drop_duplicates Dataset.drop_dims Dataset.set_coords Dataset.reset_coords diff --git a/doc/whats-new.rst b/doc/whats-new.rst index aa48bd619e8..24a8042ee66 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,7 +31,9 @@ New Features - Enbable to provide more keyword arguments to `pydap` backend when reading OpenDAP datasets (:issue:`6274`). By `Jonas Gliß `. - +- Allow :py:meth:`DataArray.drop_duplicates` to drop duplicates along multiple dimensions at once, + and add :py:meth:`Dataset.drop_duplicates`. (:pull:`6307`) + By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 20e829d293e..b3c45d65818 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4659,14 +4659,15 @@ def curvefit( def drop_duplicates( self, - dim: Hashable, - keep: (str | bool) = "first", + dim: Hashable | Iterable[Hashable] | ..., + keep: Literal["first", "last"] | Literal[False] = "first", ): """Returns a new DataArray with duplicate dimension values removed. Parameters ---------- - dim : dimension label, optional + dim : dimension label or labels + Pass `...` to drop duplicates along all dimensions. keep : {"first", "last", False}, default: "first" Determines which duplicates (if any) to keep. - ``"first"`` : Drop duplicates except for the first occurrence. @@ -4676,11 +4677,13 @@ def drop_duplicates( Returns ------- DataArray + + See Also + -------- + Dataset.drop_duplicates """ - if dim not in self.dims: - raise ValueError(f"'{dim}' not found in dimensions") - indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} - return self.isel(indexes) + deduplicated = self._to_temp_dataset().drop_duplicates(dim, keep=keep) + return self._from_temp_dataset(deduplicated) def convert_calendar( self, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fb30cf22e04..be9df9d2e2d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7770,6 +7770,45 @@ def _wrapper(Y, *coords_, **kwargs): return result + def drop_duplicates( + self, + dim: Hashable | Iterable[Hashable] | ..., + keep: Literal["first", "last"] | Literal[False] = "first", + ): + """Returns a new Dataset with duplicate dimension values removed. + + Parameters + ---------- + dim : dimension label or labels + Pass `...` to drop duplicates along all dimensions. + keep : {"first", "last", False}, default: "first" + Determines which duplicates (if any) to keep. + - ``"first"`` : Drop duplicates except for the first occurrence. + - ``"last"`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + + Returns + ------- + Dataset + + See Also + -------- + DataArray.drop_duplicates + """ + if isinstance(dim, str): + dims = (dim,) + elif dim is ...: + dims = self.dims + else: + dims = dim + + missing_dims = set(dims) - set(self.dims) + if missing_dims: + raise ValueError(f"'{missing_dims}' not found in dimensions") + + indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims} + return self.isel(indexes) + def convert_calendar( self, calendar: str, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8d73f9ec7ee..fc82c03c5d9 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6618,25 +6618,50 @@ def test_clip(da): result = da.clip(min=da.mean("x"), max=da.mean("a").isel(x=[0, 1])) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_drop_duplicates(keep): - ds = xr.DataArray( - [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test" - ) +class TestDropDuplicates: + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_drop_duplicates_1d(self, keep): + da = xr.DataArray( + [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test" + ) - if keep == "first": - data = [0, 6, 7] - time = [0, 1, 2] - elif keep == "last": - data = [5, 6, 7] - time = [0, 1, 2] - else: - data = [6, 7] - time = [1, 2] + if keep == "first": + data = [0, 6, 7] + time = [0, 1, 2] + elif keep == "last": + data = [5, 6, 7] + time = [0, 1, 2] + else: + data = [6, 7] + time = [1, 2] + + expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test") + result = da.drop_duplicates("time", keep=keep) + assert_equal(expected, result) + + with pytest.raises(ValueError, match="['space'] not found"): + da.drop_duplicates("space", keep=keep) + + def test_drop_duplicates_2d(self): + da = xr.DataArray( + [[0, 5, 6, 7], [2, 1, 3, 4]], + dims=["space", "time"], + coords={"space": [10, 10], "time": [0, 0, 1, 2]}, + name="test", + ) + + expected = xr.DataArray( + [[0, 6, 7]], + dims=["space", "time"], + coords={"time": ("time", [0, 1, 2]), "space": ("space", [10])}, + name="test", + ) + + result = da.drop_duplicates(["time", "space"], keep="first") + assert_equal(expected, result) - expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test") - result = ds.drop_duplicates("time", keep=keep) - assert_equal(expected, result) + result = da.drop_duplicates(..., keep="first") + assert_equal(expected, result) class TestNumpyCoercion: diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c4fa847e664..7ff75fb791b 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6546,6 +6546,37 @@ def test_clip(ds): assert result.dims == ds.dims +class TestDropDuplicates: + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_drop_duplicates_1d(self, keep): + ds = xr.Dataset( + {"a": ("time", [0, 5, 6, 7]), "b": ("time", [9, 3, 8, 2])}, + coords={"time": [0, 0, 1, 2]}, + ) + + if keep == "first": + a = [0, 6, 7] + b = [9, 8, 2] + time = [0, 1, 2] + elif keep == "last": + a = [5, 6, 7] + b = [3, 8, 2] + time = [0, 1, 2] + else: + a = [6, 7] + b = [8, 2] + time = [1, 2] + + expected = xr.Dataset( + {"a": ("time", a), "b": ("time", b)}, coords={"time": time} + ) + result = ds.drop_duplicates("time", keep=keep) + assert_equal(expected, result) + + with pytest.raises(ValueError, match="['space'] not found"): + ds.drop_duplicates("space", keep=keep) + + class TestNumpyCoercion: def test_from_numpy(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", [4, 5, 6])})