From 0cd58fbec63d5e461b487e7e37aa9942ebe0f116 Mon Sep 17 00:00:00 2001 From: AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:40:04 -0500 Subject: [PATCH] Fix index difference to follow the pandas format (#14789) This PR fixes an error in `Index.difference` where the function keeps duplicate elements while pandas removes the duplicates. The tests had no inputs with duplicates, so I added new tests too (I added the test from the original issue). - closes #14489 Authors: - AmirAli Mirian (https://github.com/amiralimi) - Ashwin Srinath (https://github.com/shwina) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14789 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_index.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2aef77b6c99..d7d8e26db1b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1040,11 +1040,11 @@ def difference(self, other, sort=None): res_name = _get_result_name(self.name, other.name) if is_mixed_with_object_dtype(self, other): - difference = self.copy() + difference = self.copy().unique() else: other = other.copy(deep=False) difference = cudf.core.index._index_from_data( - cudf.DataFrame._from_data({"None": self._column}) + cudf.DataFrame._from_data({"None": self._column.unique()}) .merge( cudf.DataFrame._from_data({"None": other._column}), how="leftanti", diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index a480a4624f7..e0a369d8d91 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. """ Test related to Index @@ -803,6 +803,7 @@ def test_index_to_series(data): pd.Series(["1", "2", "a", "3", None], dtype="category"), range(0, 10), [], + [1, 1, 2, 2], ], ) @pytest.mark.parametrize( @@ -819,6 +820,7 @@ def test_index_to_series(data): range(2, 4), pd.Series(["1", "a", "3", None], dtype="category"), [], + [2], ], ) @pytest.mark.parametrize("sort", [None, False])