diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0d0e9d9a54fff..c43126634a107 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -209,6 +209,7 @@ Other enhancements - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) +- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7895f448424e3..4ff3cb7d4f02d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4614,6 +4614,7 @@ def drop_duplicates( subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, + ignore_index: bool = False, ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -4633,6 +4634,10 @@ def drop_duplicates( - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -4648,9 +4653,16 @@ def drop_duplicates( if inplace: (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: - return self[-duplicated] + result = self[-duplicated] + + if ignore_index: + result.index = ibase.default_index(len(result)) + return result return None diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index a7715d1f31673..29ab2e1bfd512 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -391,3 +391,36 @@ def test_drop_duplicates_inplace(): expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + # Test when inplace is False + result = df.drop_duplicates(ignore_index=ignore_index) + tm.assert_frame_equal(result, expected) + + # to verify original dataframe is not mutated + tm.assert_frame_equal(df, DataFrame(origin_dict)) + + # Test when inplace is True + copied_df = df.copy() + + copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) + tm.assert_frame_equal(copied_df, expected) + + # to verify that input is unchanged + tm.assert_frame_equal(df, DataFrame(origin_dict))