From d3a262b459517ed530599584ef91185d174d7670 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:01:59 -0500
Subject: [PATCH 1/7] tests for da.drop_duplicates over multiple dims

---
 xarray/tests/test_dataarray.py | 59 ++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 8d73f9ec7ee..fc82c03c5d9 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -6618,25 +6618,50 @@ def test_clip(da):
         result = da.clip(min=da.mean("x"), max=da.mean("a").isel(x=[0, 1]))
 
 
-@pytest.mark.parametrize("keep", ["first", "last", False])
-def test_drop_duplicates(keep):
-    ds = xr.DataArray(
-        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
-    )
+class TestDropDuplicates:
+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_drop_duplicates_1d(self, keep):
+        da = xr.DataArray(
+            [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
+        )
 
-    if keep == "first":
-        data = [0, 6, 7]
-        time = [0, 1, 2]
-    elif keep == "last":
-        data = [5, 6, 7]
-        time = [0, 1, 2]
-    else:
-        data = [6, 7]
-        time = [1, 2]
+        if keep == "first":
+            data = [0, 6, 7]
+            time = [0, 1, 2]
+        elif keep == "last":
+            data = [5, 6, 7]
+            time = [0, 1, 2]
+        else:  # keep=False: every occurrence of the duplicated label is dropped
+            data = [6, 7]
+            time = [1, 2]
+
+        expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test")
+        result = da.drop_duplicates("time", keep=keep)
+        assert_equal(expected, result)
+
+        with pytest.raises(ValueError, match="'space'.* not found"):
+            da.drop_duplicates("space", keep=keep)  # "space" is not a dim of da
+
+    def test_drop_duplicates_2d(self):
+        da = xr.DataArray(
+            [[0, 5, 6, 7], [2, 1, 3, 4]],
+            dims=["space", "time"],
+            coords={"space": [10, 10], "time": [0, 0, 1, 2]},
+            name="test",
+        )
+
+        expected = xr.DataArray(
+            [[0, 6, 7]],
+            dims=["space", "time"],
+            coords={"time": ("time", [0, 1, 2]), "space": ("space", [10])},
+            name="test",
+        )
+
+        result = da.drop_duplicates(["time", "space"], keep="first")
+        assert_equal(expected, result)
 
-    expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test")
-    result = ds.drop_duplicates("time", keep=keep)
-    assert_equal(expected, result)
+        result = da.drop_duplicates(..., keep="first")  # ``...`` = all dimensions
+        assert_equal(expected, result)
 
 
 class TestNumpyCoercion:

From fae4064b80223ee7dcb352759ea4f2fc93e4890c Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:02:11 -0500
Subject: [PATCH 2/7] pass tests

---
 xarray/core/dataarray.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 20e829d293e..4a46ccf9205 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4659,14 +4659,15 @@ def curvefit(
 
     def drop_duplicates(
         self,
-        dim: Hashable,
-        keep: (str | bool) = "first",
+        dim: Hashable | Iterable[Hashable] | ...,
+        keep: Literal["first", "last"] | Literal[False] = "first",
     ):
         """Returns a new DataArray with duplicate dimension values removed.
 
         Parameters
         ----------
-        dim : dimension label, optional
+        dim : dimension label or labels
+            Pass `...` to drop duplicates along all dimensions.
         keep : {"first", "last", False}, default: "first"
             Determines which duplicates (if any) to keep.
             - ``"first"`` : Drop duplicates except for the first occurrence.
@@ -4677,9 +4678,18 @@ def drop_duplicates(
         -------
         DataArray
         """
-        if dim not in self.dims:
+        if isinstance(dim, str):
+            dims = (dim,)
+        elif dim is ...:
+            dims = self.dims
+        else:
+            dims = dim
+
+        missing_dims = set(dims) - set(self.dims)
+        if missing_dims:
             raise ValueError(f"'{dim}' not found in dimensions")
-        indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
+
+        indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims}
         return self.isel(indexes)
 
     def convert_calendar(

From 3f1f88c4bdc54b38e699c127a46b52f3b4922800 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:28:04 -0500
Subject: [PATCH 3/7] test for Dataset.drop_duplicates

---
 xarray/tests/test_dataset.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index c4fa847e664..7ff75fb791b 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -6546,6 +6546,37 @@ def test_clip(ds):
     assert result.dims == ds.dims
 
 
+class TestDropDuplicates:
+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_drop_duplicates_1d(self, keep):
+        ds = xr.Dataset(
+            {"a": ("time", [0, 5, 6, 7]), "b": ("time", [9, 3, 8, 2])},
+            coords={"time": [0, 0, 1, 2]},
+        )
+
+        if keep == "first":
+            a = [0, 6, 7]
+            b = [9, 8, 2]
+            time = [0, 1, 2]
+        elif keep == "last":
+            a = [5, 6, 7]
+            b = [3, 8, 2]
+            time = [0, 1, 2]
+        else:  # keep=False: every occurrence of the duplicated label is dropped
+            a = [6, 7]
+            b = [8, 2]
+            time = [1, 2]
+
+        expected = xr.Dataset(
+            {"a": ("time", a), "b": ("time", b)}, coords={"time": time}
+        )
+        result = ds.drop_duplicates("time", keep=keep)
+        assert_equal(expected, result)
+
+        with pytest.raises(ValueError, match="'space'.* not found"):
+            ds.drop_duplicates("space", keep=keep)  # "space" is not a dim of ds
+
+
 class TestNumpyCoercion:
     def test_from_numpy(self):
         ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"lat": ("x", [4, 5, 6])})

From d62b2a487e2f17c65b82503e0646c20a418c2382 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:28:29 -0500
Subject: [PATCH 4/7] piped both paths through dataset.drop_duplicates

---
 xarray/core/dataarray.py | 19 ++++++-------------
 xarray/core/dataset.py   | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 4a46ccf9205..b3c45d65818 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4677,20 +4677,13 @@ def drop_duplicates(
         Returns
         -------
         DataArray
-        """
-        if isinstance(dim, str):
-            dims = (dim,)
-        elif dim is ...:
-            dims = self.dims
-        else:
-            dims = dim
-
-        missing_dims = set(dims) - set(self.dims)
-        if missing_dims:
-            raise ValueError(f"'{dim}' not found in dimensions")
 
-        indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims}
-        return self.isel(indexes)
+        See Also
+        --------
+        Dataset.drop_duplicates
+        """
+        deduplicated = self._to_temp_dataset().drop_duplicates(dim, keep=keep)
+        return self._from_temp_dataset(deduplicated)
 
     def convert_calendar(
         self,
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index fb30cf22e04..52f575ee4c6 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -7770,6 +7770,45 @@ def _wrapper(Y, *coords_, **kwargs):
 
         return result
 
+    def drop_duplicates(
+        self,
+        dim: Hashable | Iterable[Hashable] | ...,
+        keep: Literal["first", "last"] | Literal[False] = "first",
+    ):
+        """Returns a new Dataset with duplicate dimension values removed.
+
+        Parameters
+        ----------
+        dim : dimension label or labels
+            Dimension(s) to deduplicate; pass ``...`` to use all dimensions.
+        keep : {"first", "last", False}, default: "first"
+            Determines which duplicates (if any) to keep.
+            - ``"first"`` : Drop duplicates except for the first occurrence.
+            - ``"last"`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+
+        Returns
+        -------
+        Dataset
+
+        See Also
+        --------
+        DataArray.drop_duplicates
+        """
+        if isinstance(dim, str):
+            dims = (dim,)  # a str is one label, not an iterable of characters
+        elif dim is ...:
+            dims = self.dims  # Ellipsis selects every dimension
+        else:
+            dims = dim  # NOTE(review): a single non-str hashable label also lands here
+
+        missing_dims = set(dims) - set(self.dims)
+        if missing_dims:
+            raise ValueError(f"'{dim}' not found in dimensions")
+
+        indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims}
+        return self.isel(indexes)
+
     def convert_calendar(
         self,
         calendar: str,

From 148a3e77186be87fdfc75702e882552f17ee695e Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:32:13 -0500
Subject: [PATCH 5/7] added dataset.drop_duplicates to API docs

---
 doc/api.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/api.rst b/doc/api.rst
index b552bc6b4d2..d2c222da4db 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -106,6 +106,7 @@ Dataset contents
    Dataset.swap_dims
    Dataset.expand_dims
    Dataset.drop_vars
+   Dataset.drop_duplicates
    Dataset.drop_dims
    Dataset.set_coords
    Dataset.reset_coords

From d545e5db9133b7234a2e7783038768ca9e45dfaa Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 12:34:38 -0500
Subject: [PATCH 6/7] whats-new entry

---
 doc/whats-new.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index aa48bd619e8..24a8042ee66 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -31,7 +31,9 @@ New Features
 - Enbable to provide more keyword arguments to `pydap` backend when reading
   OpenDAP datasets (:issue:`6274`).
   By `Jonas Gliß <https://github.com/jgliss>`.
-
+- Allow :py:meth:`DataArray.drop_duplicates` to drop duplicates along multiple dimensions at once,
+  and add :py:meth:`Dataset.drop_duplicates`. (:pull:`6307`)
+  By `Tom Nicholas <https://github.com/TomNicholas>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~

From 33d05e80a8f5bcb992e353b87986fad8eecaac0f Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <thomas.nicholas@columbia.edu>
Date: Fri, 25 Feb 2022 14:10:31 -0500
Subject: [PATCH 7/7] correct small bug when raising error

---
 xarray/core/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 52f575ee4c6..be9df9d2e2d 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -7804,7 +7804,7 @@ def drop_duplicates(
 
         missing_dims = set(dims) - set(self.dims)
         if missing_dims:
-            raise ValueError(f"'{dim}' not found in dimensions")
+            raise ValueError(f"'{missing_dims}' not found in dimensions")
 
         indexes = {dim: ~self.get_index(dim).duplicated(keep=keep) for dim in dims}
         return self.isel(indexes)