From 4dc8300c6104697b1d9313a48d1c4c7f5dabf81a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 16 Nov 2023 16:22:44 -0600 Subject: [PATCH] Raise error in `reindex` when `index` is not unique (#14400) (#14429) Bacport of #14400 Fixes: #14398 This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14400 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ashwin Srinath (https://github.com/shwina) - Ray Douglass (https://github.com/raydouglass) --- python/cudf/cudf/core/indexed_frame.py | 4 ++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/backends.py | 13 ++++--------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 219b8021241..fef62594fb8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2606,6 +2606,10 @@ def _reindex( df = self if index is not None: + if not df._index.is_unique: + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..5677f97408a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot(): expected = gser @ [12, 13] assert_eq(expected, actual) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..c15a797713f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): s[0] = 10 + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 344b03c631d..2be256f85e8 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -437,17 +437,12 @@ def union_categoricals_cudf( ) -@_dask_cudf_nvtx_annotate -def safe_hash(frame): - return cudf.Series(frame.hash_values(), index=frame.index) - - @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) @_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: - return safe_hash(frame.reset_index()) - return safe_hash(frame) + frame = frame.reset_index() + return frame.hash_values() @hash_object_dispatch.register(cudf.BaseIndex) @@ -455,10 +450,10 @@ def hash_object_cudf(frame, index=True): def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): - return safe_hash(ind.to_frame(index=False)) + return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return safe_hash(cudf.Series(col)) + return cudf.Series(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame))