ENH: Add ignore_index for df.drop_duplicates (#30405)

pandas-dev · Dec 27, 2019 · 7025c59
1 parent 0a3c1d7
commit 7025c59
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -209,6 +209,7 @@ Other enhancements
 - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`)
 - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
 
+- :meth:`DataFrame.drop_duplicates` has gained the ``ignore_index`` keyword to reset the index after dropping duplicates (:issue:`30114`)
 
 Build Changes
 ^^^^^^^^^^^^^

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4614,6 +4614,7 @@ def drop_duplicates(
         subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
         keep: Union[str, bool] = "first",
         inplace: bool = False,
+        ignore_index: bool = False,
     ) -> Optional["DataFrame"]:
         """
         Return DataFrame with duplicate rows removed.
@@ -4633,6 +4634,10 @@ def drop_duplicates(
             - False : Drop all duplicates.
         inplace : bool, default False
             Whether to drop duplicates in place or to return a copy.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 1.0.0
 
         Returns
         -------
@@ -4648,9 +4653,16 @@ def drop_duplicates(
         if inplace:
             (inds,) = (-duplicated)._ndarray_values.nonzero()
             new_data = self._data.take(inds)
+
+            if ignore_index:
+                new_data.axes[1] = ibase.default_index(len(inds))
             self._update_inplace(new_data)
         else:
-            return self[-duplicated]
+            result = self[-duplicated]
+
+            if ignore_index:
+                result.index = ibase.default_index(len(result))
+            return result
 
         return None
 

diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -391,3 +391,36 @@ def test_drop_duplicates_inplace():
     expected = orig2.drop_duplicates(["A", "B"], keep=False)
     result = df2
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "origin_dict, output_dict, ignore_index, output_index",
+    [
+        ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
+        ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
+        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
+        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
+    ],
+)
+def test_drop_duplicates_ignore_index(
+    origin_dict, output_dict, ignore_index, output_index
+):
+    # GH 30114
+    df = DataFrame(origin_dict)
+    expected = DataFrame(output_dict, index=output_index)
+
+    # Test when inplace is False
+    result = df.drop_duplicates(ignore_index=ignore_index)
+    tm.assert_frame_equal(result, expected)
+
+    # to verify original dataframe is not mutated
+    tm.assert_frame_equal(df, DataFrame(origin_dict))
+
+    # Test when inplace is True
+    copied_df = df.copy()
+
+    copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True)
+    tm.assert_frame_equal(copied_df, expected)
+
+    # to verify that input is unchanged
+    tm.assert_frame_equal(df, DataFrame(origin_dict))