From 675af5401833287dac7e491bb1d6fa79c863bc0f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 29 Mar 2018 18:35:24 -0700 Subject: [PATCH 1/3] Deprecated Index.get_duplicates() --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/indexes/base.py | 13 +++++++++++-- pandas/tests/indexes/test_base.py | 5 +++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e83f149db1f18..cb3544b5df3a6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -831,6 +831,7 @@ Deprecations - ``pandas.tseries.plotting.tsplot`` is deprecated. Use :func:`Series.plot` instead (:issue:`18627`) - ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`) - ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`) +- ``Index.get_duplicates()`` is deprecated and will be removed in a future version (:issue:`20239`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12bb09e8f8a8a..51b3b682fd4a5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1824,6 +1824,9 @@ def get_duplicates(self): Returns a sorted list of index elements which appear more than once in the index. + .. deprecated:: 0.23.0 + Use idx[idx.duplicated()].unique() instead + Returns ------- array-like @@ -1870,14 +1873,20 @@ def get_duplicates(self): >>> pd.Index(dates).get_duplicates() DatetimeIndex([], dtype='datetime64[ns]', freq=None) """ + warnings.warn("'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, stacklevel=2) + + return self._get_duplicates() + + def _get_duplicates(self): from collections import defaultdict counter = defaultdict(lambda: 0) for k in self.values: counter[k] += 1 return sorted(k for k, v in compat.iteritems(counter) if v > 1) - _get_duplicates = get_duplicates - def _cleanup(self): self._engine.clear_mapping() diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ff9c86fbfe384..d396d3b7e0036 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2061,6 +2061,11 @@ def test_cached_properties_not_settable(self): with tm.assert_raises_regex(AttributeError, "Can't set attribute"): idx.is_unique = False + def test_get_duplicates_deprecated(self): + idx = pd.Index([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + idx.get_duplicates() + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ From ed58eec355350f2005bc1ccbee7ee46b465a0791 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 1 Apr 2018 09:30:04 -0700 Subject: [PATCH 2/3] Updated return val and test cases --- pandas/core/indexes/base.py | 9 +-------- pandas/tests/indexes/test_multi.py | 6 ++++-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 51b3b682fd4a5..6f7da6ae6c3d2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1878,14 +1878,7 @@ def get_duplicates(self): "idx[idx.duplicated()].unique() instead", FutureWarning, stacklevel=2) - return self._get_duplicates() - - def _get_duplicates(self): - from collections import defaultdict - counter = defaultdict(lambda: 0) - for k in self.values: - counter[k] += 1 - return sorted(k for k, v in compat.iteritems(counter) if v > 1) + return self[self.duplicated()].unique() def _cleanup(self): self._engine.clear_mapping() diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 984f37042d600..f99b94216aa33 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2413,7 +2413,8 @@ def check(nlevels, with_nulls): for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates - assert mi.get_duplicates() == [] + assert mi.get_duplicates().equals( + MultiIndex.from_arrays([[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( 2, dtype='bool')) @@ -2425,7 +2426,8 @@ def check(nlevels, with_nulls): labels=np.random.permutation(list(lab)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates - assert mi.get_duplicates() == [] + assert mi.get_duplicates().equals( + MultiIndex.from_arrays([[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( len(mi), dtype='bool')) From a1c5e51298715fec33e377e16575cca0f03b681e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 1 Apr 2018 09:44:26 -0700 Subject: [PATCH 3/3] Updated internal refs --- pandas/core/frame.py | 5 +++-- pandas/core/indexes/datetimelike.py | 4 ---- pandas/core/reshape/concat.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 6 +++++- pandas/tests/indexes/test_multi.py | 16 ++++++++++++---- .../tests/indexes/timedeltas/test_timedelta.py | 7 ++++++- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 35f3a7c20e270..11b9d93a27284 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3840,8 +3840,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, index = _ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: - duplicates = index.get_duplicates() - raise ValueError('Index has duplicate keys: %s' % duplicates) + duplicates = index[index.duplicated()].unique() + raise ValueError('Index has duplicate keys: {duplicates!s}'.format( + duplicates=duplicates)) for c in to_remove: del frame[c] diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b906ea0f4784c..ae6bd80de5d12 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -501,10 +501,6 @@ def take(self, indices, axis=0, allow_fill=True, freq = self.freq if isinstance(self, ABCPeriodIndex) else None return self._shallow_copy(taken, freq=freq) - def get_duplicates(self): - values = Index.get_duplicates(self) - return self._simple_new(values) - _can_hold_na = True _na_value = NaT diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 20f4384a3d698..6e564975f34cd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -504,7 +504,7 @@ def _get_concat_axis(self): def _maybe_check_integrity(self, concat_index): if self.verify_integrity: if not concat_index.is_unique: - overlap = concat_index.get_duplicates() + overlap = concat_index[concat_index.duplicated()].unique() raise ValueError('Indexes have overlapping values: ' '{overlap!s}'.format(overlap=overlap)) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 51788b3e25507..b3aab6dba796c 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,3 +1,4 @@ +import warnings import pytest @@ -178,7 +179,10 @@ def test_get_duplicates(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-03', '2000-01-03', '2000-01-04']) - result = idx.get_duplicates() + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + result = idx.get_duplicates() + ex = DatetimeIndex(['2000-01-02', '2000-01-03']) tm.assert_index_equal(result, ex) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index f99b94216aa33..0ae4b43575f66 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2413,8 +2413,12 @@ def check(nlevels, with_nulls): for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates - assert mi.get_duplicates().equals( - MultiIndex.from_arrays([[], []])) + + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + assert mi.get_duplicates().equals(MultiIndex.from_arrays( + [[], []])) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( 2, dtype='bool')) @@ -2426,8 +2430,12 @@ def check(nlevels, with_nulls): labels=np.random.permutation(list(lab)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates - assert mi.get_duplicates().equals( - MultiIndex.from_arrays([[], []])) + + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + assert mi.get_duplicates().equals(MultiIndex.from_arrays( + [[], []])) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( len(mi), dtype='bool')) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 4692b6d675e6b..d7745ffd94cd9 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -1,3 +1,5 @@ +import warnings + import pytest import numpy as np @@ -145,7 +147,10 @@ def test_get_duplicates(self): idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day', '4day']) - result = idx.get_duplicates() + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + result = idx.get_duplicates() + ex = TimedeltaIndex(['2 day', '3day']) tm.assert_index_equal(result, ex)