Skip to content

Commit

Permalink
Raise error in reindex when index is not unique (rapidsai#14400) (r…
Browse files Browse the repository at this point in the history
…apidsai#14429)

Backport of rapidsai#14400
Fixes: rapidsai#14398 
This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: rapidsai#14400

Authors:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
   - Richard (Rick) Zamora (https://github.com/rjzamora)
   - Ashwin Srinath (https://github.com/shwina)
   - Ray Douglass (https://github.com/raydouglass)
  • Loading branch information
galipremsagar authored Nov 16, 2023
1 parent 38a5a32 commit 4dc8300
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 9 deletions.
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2606,6 +2606,10 @@ def _reindex(

df = self
if index is not None:
if not df._index.is_unique:
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index, name=getattr(index, "name", self._index.name)
)
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
expected = gser @ [12, 13]

assert_eq(expected, actual)


def test_dataframe_duplicate_index_reindex():
    """Compare cudf and pandas ``DataFrame.reindex`` on duplicate labels.

    A single-column frame is built with the non-unique index
    ``[0, 0, 1, 1]``. Both the cudf frame and its pandas conversion are
    asked to reindex to ``[10, 11, 12, 13]``, and the two calls are
    compared through ``assert_exceptions_equal``.
    """
    cudf_frame = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
    pandas_frame = cudf_frame.to_pandas()

    # Each side receives its own (args, kwargs) pair so no object is shared
    # between the cudf call and the pandas call.
    cudf_call = ([10, 11, 12, 13], {})
    pandas_call = ([10, 11, 12, 13], {})

    assert_exceptions_equal(
        cudf_frame.reindex,
        pandas_frame.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


def test_series_duplicate_index_reindex():
    """Compare cudf and pandas ``Series.reindex`` on duplicate labels.

    A series is built with the non-unique index ``[0, 0, 1, 1]``. Both the
    cudf series and its pandas conversion are asked to reindex to
    ``[10, 11, 12, 13]``, and the two calls are compared through
    ``assert_exceptions_equal``.
    """
    cudf_series = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
    pandas_series = cudf_series.to_pandas()

    # Each side receives its own (args, kwargs) pair so no object is shared
    # between the cudf call and the pandas call.
    cudf_call = ([10, 11, 12, 13], {})
    pandas_call = ([10, 11, 12, 13], {})

    assert_exceptions_equal(
        cudf_series.reindex,
        pandas_series.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
13 changes: 4 additions & 9 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,28 +437,23 @@ def union_categoricals_cudf(
)


@_dask_cudf_nvtx_annotate
def safe_hash(frame):
return cudf.Series(frame.hash_values(), index=frame.index)


@hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
@_dask_cudf_nvtx_annotate
def hash_object_cudf(frame, index=True):
if index:
return safe_hash(frame.reset_index())
return safe_hash(frame)
frame = frame.reset_index()
return frame.hash_values()


@hash_object_dispatch.register(cudf.BaseIndex)
@_dask_cudf_nvtx_annotate
def hash_object_cudf_index(ind, index=None):

if isinstance(ind, cudf.MultiIndex):
return safe_hash(ind.to_frame(index=False))
return ind.to_frame(index=False).hash_values()

col = cudf.core.column.as_column(ind)
return safe_hash(cudf.Series(col))
return cudf.Series(col).hash_values()


@group_split_dispatch.register((cudf.Series, cudf.DataFrame))
Expand Down

0 comments on commit 4dc8300

Please sign in to comment.