Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecated Index.get_duplicates() #20544

Merged
merged 8 commits into from
Apr 24, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,7 @@ Deprecations
- :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
:func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
- ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`)
- ``Index.get_duplicates()`` is deprecated and will be removed in a future version. Use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`)

.. _whatsnew_0230.prior_deprecations:

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3878,7 +3878,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
index = _ensure_index_from_sequences(arrays, names)

if verify_integrity and not index.is_unique:
duplicates = index.get_duplicates()
duplicates = index[index.duplicated()].unique()
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

Expand Down
14 changes: 8 additions & 6 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,9 @@ def get_duplicates(self):
Returns the index elements which appear more than once in the index,
as a new index containing each duplicated element once.

.. deprecated:: 0.23.0
Use ``idx[idx.duplicated()].unique()`` instead.

Returns
-------
array-like
Expand Down Expand Up @@ -1870,13 +1873,12 @@ def get_duplicates(self):
>>> pd.Index(dates).get_duplicates()
DatetimeIndex([], dtype='datetime64[ns]', freq=None)
"""
from collections import defaultdict
counter = defaultdict(lambda: 0)
for k in self.values:
counter[k] += 1
return sorted(k for k, v in compat.iteritems(counter) if v > 1)
warnings.warn("'get_duplicates' is deprecated and will be removed in "
"a future release. You can use "
"idx[idx.duplicated()].unique() instead",
FutureWarning, stacklevel=2)

_get_duplicates = get_duplicates
return self[self.duplicated()].unique()

def _cleanup(self):
self._engine.clear_mapping()
Expand Down
4 changes: 0 additions & 4 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,10 +501,6 @@ def take(self, indices, axis=0, allow_fill=True,
freq = self.freq if isinstance(self, ABCPeriodIndex) else None
return self._shallow_copy(taken, freq=freq)

def get_duplicates(self):
values = Index.get_duplicates(self)
return self._simple_new(values)

_can_hold_na = True

_na_value = NaT
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ def _get_concat_axis(self):
def _maybe_check_integrity(self, concat_index):
if self.verify_integrity:
if not concat_index.is_unique:
overlap = concat_index.get_duplicates()
overlap = concat_index[concat_index.duplicated()].unique()
raise ValueError('Indexes have overlapping values: '
'{overlap!s}'.format(overlap=overlap))

Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings

import pytest

Expand Down Expand Up @@ -178,7 +179,10 @@ def test_get_duplicates(self):
idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02',
'2000-01-03', '2000-01-03', '2000-01-04'])

result = idx.get_duplicates()
with warnings.catch_warnings(record=True):
# Deprecated - see GH20239
result = idx.get_duplicates()

ex = DatetimeIndex(['2000-01-02', '2000-01-03'])
tm.assert_index_equal(result, ex)

Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2060,6 +2060,11 @@ def test_cached_properties_not_settable(self):
with tm.assert_raises_regex(AttributeError, "Can't set attribute"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to either remove the remaining uses of ``get_duplicates`` from all tests or catch the deprecation warnings they now raise:

(pandas) bash-3.2$ grep -r get_duplicates pandas
pandas/core/reshape/concat.py:                overlap = concat_index.get_duplicates()
Binary file pandas/core/reshape/__pycache__/concat.cpython-36.pyc matches
Binary file pandas/core/__pycache__/frame.cpython-36.pyc matches
pandas/core/frame.py:            duplicates = index.get_duplicates()
Binary file pandas/core/indexes/__pycache__/datetimelike.cpython-36.pyc matches
Binary file pandas/core/indexes/__pycache__/base.cpython-36.pyc matches
pandas/core/indexes/datetimelike.py:    def get_duplicates(self):
pandas/core/indexes/datetimelike.py:        values = Index.get_duplicates(self)
pandas/core/indexes/base.py:    def get_duplicates(self):
pandas/core/indexes/base.py:        >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index([1., 2., 2., 3., 3., 3., 4.]).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index(['a', 'b', 'b', 'c', 'c', 'c', 'd']).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index(dates).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index([1, 2, 3, 4]).get_duplicates()
pandas/core/indexes/base.py:        >>> pd.Index(dates).get_duplicates()
pandas/core/indexes/base.py:    _get_duplicates = get_duplicates
Binary file pandas/tests/indexes/__pycache__/test_multi.cpython-36-PYTEST.pyc matches
Binary file pandas/tests/indexes/datetimes/__pycache__/test_datetime.cpython-36-PYTEST.pyc matches
pandas/tests/indexes/datetimes/test_datetime.py:    def test_get_duplicates(self):
pandas/tests/indexes/datetimes/test_datetime.py:        result = idx.get_duplicates()
Binary file pandas/tests/indexes/timedeltas/__pycache__/test_timedelta.cpython-36-PYTEST.pyc matches
pandas/tests/indexes/timedeltas/test_timedelta.py:    def test_get_duplicates(self):
pandas/tests/indexes/timedeltas/test_timedelta.py:        result = idx.get_duplicates()
pandas/tests/indexes/test_multi.py:            assert mi.get_duplicates() == []
pandas/tests/indexes/test_multi.py:                assert mi.get_duplicates() == []

idx.is_unique = False

def test_get_duplicates_deprecated(self):
idx = pd.Index([1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
idx.get_duplicates()


class TestMixedIntIndex(Base):
# Mostly the tests from common.py for which the results differ
Expand Down
14 changes: 12 additions & 2 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2413,7 +2413,12 @@ def check(nlevels, with_nulls):
for a in [101, 102]:
mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
assert not mi.has_duplicates
assert mi.get_duplicates() == []

with warnings.catch_warnings(record=True):
# Deprecated - see GH20239
assert mi.get_duplicates().equals(MultiIndex.from_arrays(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had to tweak the assertion here given the return value is no longer a list, though I assume you are aware of that from the original issue. With that said, this is a behavior change for non-datetimelike indexes, which previously returned a sorted list (datetimelike indexes were already returning an Index-like object) - is it worth documenting that in the whatsnew, or is this getting into too technical of a distinction?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is fine

[[], []]))

tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
2, dtype='bool'))

Expand All @@ -2425,7 +2430,12 @@ def check(nlevels, with_nulls):
labels=np.random.permutation(list(lab)).T)
assert len(mi) == (n + 1) * (m + 1)
assert not mi.has_duplicates
assert mi.get_duplicates() == []

with warnings.catch_warnings(record=True):
# Deprecated - see GH20239
assert mi.get_duplicates().equals(MultiIndex.from_arrays(
[[], []]))

tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(
len(mi), dtype='bool'))

Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/indexes/timedeltas/test_timedelta.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import pytest

import numpy as np
Expand Down Expand Up @@ -145,7 +147,10 @@ def test_get_duplicates(self):
idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day',
'4day'])

result = idx.get_duplicates()
with warnings.catch_warnings(record=True):
# Deprecated - see GH20239
result = idx.get_duplicates()

ex = TimedeltaIndex(['2 day', '3day'])
tm.assert_index_equal(result, ex)

Expand Down