From d3a262b459517ed530599584ef91185d174d7670 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:01:59 -0500 Subject: [PATCH 1/7] tests for da.drop_duplicates over multiple dims --- xarray/tests/test_dataarray.py | 59 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8d73f9ec7ee..fc82c03c5d9 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6618,25 +6618,50 @@ def test_clip(da): result = da.clip(min=da.mean("x"), max=da.mean("a").isel(x=[0, 1])) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_drop_duplicates(keep): - ds = xr.DataArray( - [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test" - ) +class TestDropDuplicates: + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_drop_duplicates_1d(self, keep): + da = xr.DataArray( + [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test" + ) - if keep == "first": - data = [0, 6, 7] - time = [0, 1, 2] - elif keep == "last": - data = [5, 6, 7] - time = [0, 1, 2] - else: - data = [6, 7] - time = [1, 2] + if keep == "first": + data = [0, 6, 7] + time = [0, 1, 2] + elif keep == "last": + data = [5, 6, 7] + time = [0, 1, 2] + else: + data = [6, 7] + time = [1, 2] + + expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test") + result = da.drop_duplicates("time", keep=keep) + assert_equal(expected, result) + + with pytest.raises(ValueError, match="'space'.* not found"): + da.drop_duplicates("space", keep=keep) + + def test_drop_duplicates_2d(self): + da = xr.DataArray( + [[0, 5, 6, 7], [2, 1, 3, 4]], + dims=["space", "time"], + coords={"space": [10, 10], "time": [0, 0, 1, 2]}, + name="test", + ) + + expected = xr.DataArray( + [[0, 6, 7]], + dims=["space", "time"], + coords={"time": ("time", [0, 1, 2]), "space": ("space", [10])}, +
name="test", + ) + + result = da.drop_duplicates(["time", "space"], keep="first") + assert_equal(expected, result) - expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test") - result = ds.drop_duplicates("time", keep=keep) - assert_equal(expected, result) + result = da.drop_duplicates(..., keep="first") + assert_equal(expected, result) class TestNumpyCoercion: From fae4064b80223ee7dcb352759ea4f2fc93e4890c Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:02:11 -0500 Subject: [PATCH 2/7] pass tests --- xarray/core/dataarray.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 20e829d293e..4a46ccf9205 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4659,14 +4659,15 @@ def curvefit( def drop_duplicates( self, - dim: Hashable, - keep: (str | bool) = "first", + dim: Hashable | Iterable[Hashable] | ..., + keep: Literal["first", "last"] | Literal[False] = "first", ): """Returns a new DataArray with duplicate dimension values removed. Parameters ---------- - dim : dimension label, optional + dim : dimension label or labels + Pass `...` to drop duplicates along all dimensions. keep : {"first", "last", False}, default: "first" Determines which duplicates (if any) to keep. - ``"first"`` : Drop duplicates except for the first occurrence. 
@@ -4677,9 +4678,18 @@ def drop_duplicates( ------- DataArray """ - if dim not in self.dims: + if isinstance(dim, str): + dims = (dim,) + elif dim is ...: + dims = self.dims + else: + dims = dim + + missing_dims = set(dims) - set(self.dims) + if missing_dims: raise ValueError(f"'{dim}' not found in dimensions") - indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} + + indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims} return self.isel(indexes) def convert_calendar( From 3f1f88c4bdc54b38e699c127a46b52f3b4922800 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:28:04 -0500 Subject: [PATCH 3/7] test for Dataset.drop_duplicates --- xarray/tests/test_dataset.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c4fa847e664..7ff75fb791b 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6546,6 +6546,37 @@ def test_clip(ds): assert result.dims == ds.dims +class TestDropDuplicates: + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_drop_duplicates_1d(self, keep): + ds = xr.Dataset( + {"a": ("time", [0, 5, 6, 7]), "b": ("time", [9, 3, 8, 2])}, + coords={"time": [0, 0, 1, 2]}, + ) + + if keep == "first": + a = [0, 6, 7] + b = [9, 8, 2] + time = [0, 1, 2] + elif keep == "last": + a = [5, 6, 7] + b = [3, 8, 2] + time = [0, 1, 2] + else: + a = [6, 7] + b = [8, 2] + time = [1, 2] + + expected = xr.Dataset( + {"a": ("time", a), "b": ("time", b)}, coords={"time": time} + ) + result = ds.drop_duplicates("time", keep=keep) + assert_equal(expected, result) + + with pytest.raises(ValueError, match="'space'.* not found"): + ds.drop_duplicates("space", keep=keep) + + class TestNumpyCoercion: def test_from_numpy(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", [4, 5, 6])}) From d62b2a487e2f17c65b82503e0646c20a418c2382 Mon Sep 17
00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:28:29 -0500 Subject: [PATCH 4/7] piped both paths through dataset.drop_duplicates --- xarray/core/dataarray.py | 19 ++++++------------- xarray/core/dataset.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4a46ccf9205..b3c45d65818 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4677,20 +4677,13 @@ def drop_duplicates( Returns ------- DataArray - """ - if isinstance(dim, str): - dims = (dim,) - elif dim is ...: - dims = self.dims - else: - dims = dim - - missing_dims = set(dims) - set(self.dims) - if missing_dims: - raise ValueError(f"'{dim}' not found in dimensions") - indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims} - return self.isel(indexes) + See Also + -------- + Dataset.drop_duplicates + """ + deduplicated = self._to_temp_dataset().drop_duplicates(dim, keep=keep) + return self._from_temp_dataset(deduplicated) def convert_calendar( self, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fb30cf22e04..52f575ee4c6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7770,6 +7770,45 @@ def _wrapper(Y, *coords_, **kwargs): return result + def drop_duplicates( + self, + dim: Hashable | Iterable[Hashable] | ..., + keep: Literal["first", "last"] | Literal[False] = "first", + ): + """Returns a new Dataset with duplicate dimension values removed. + + Parameters + ---------- + dim : dimension label or labels + Pass `...` to drop duplicates along all dimensions. + keep : {"first", "last", False}, default: "first" + Determines which duplicates (if any) to keep. + - ``"first"`` : Drop duplicates except for the first occurrence. + - ``"last"`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. 
+ + Returns + ------- + Dataset + + See Also + -------- + DataArray.drop_duplicates + """ + if dim is ...: + dims = self.dims + elif isinstance(dim, str) or not isinstance(dim, Iterable): + dims = (dim,) + else: + dims = dim + + missing_dims = set(dims) - set(self.dims) + if missing_dims: + raise ValueError(f"'{dim}' not found in dimensions") + + indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims} + return self.isel(indexes) + def convert_calendar( self, calendar: str, From 148a3e77186be87fdfc75702e882552f17ee695e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:32:13 -0500 Subject: [PATCH 5/7] added dataset.drop_duplicates to API docs --- doc/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/api.rst b/doc/api.rst index b552bc6b4d2..d2c222da4db 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -106,6 +106,7 @@ Dataset contents Dataset.swap_dims Dataset.expand_dims Dataset.drop_vars + Dataset.drop_duplicates Dataset.drop_dims Dataset.set_coords Dataset.reset_coords From d545e5db9133b7234a2e7783038768ca9e45dfaa Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 12:34:38 -0500 Subject: [PATCH 6/7] whats-new entry --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index aa48bd619e8..24a8042ee66 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,7 +31,9 @@ New Features - Enbable to provide more keyword arguments to `pydap` backend when reading OpenDAP datasets (:issue:`6274`). By `Jonas Gliß <https://github.com/jgliss>`. - +- Allow :py:meth:`DataArray.drop_duplicates` to drop duplicates along multiple dimensions at once, + and add :py:meth:`Dataset.drop_duplicates`. (:pull:`6307`) + By `Tom Nicholas <https://github.com/TomNicholas>`_.
Breaking changes ~~~~~~~~~~~~~~~~ From 33d05e80a8f5bcb992e353b87986fad8eecaac0f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas <thomas.nicholas@columbia.edu> Date: Fri, 25 Feb 2022 14:10:31 -0500 Subject: [PATCH 7/7] correct small bug when raising error --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 52f575ee4c6..be9df9d2e2d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7804,7 +7804,7 @@ def drop_duplicates( missing_dims = set(dims) - set(self.dims) if missing_dims: - raise ValueError(f"'{dim}' not found in dimensions") + raise ValueError(f"'{missing_dims}' not found in dimensions") indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims} return self.isel(indexes)