Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise error in reindex when index is not unique #14400

Merged
merged 8 commits into from
Nov 15, 2023
Merged
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2607,6 +2607,10 @@ def _reindex(

df = self
if index is not None:
if not df._index.is_unique:
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index, name=getattr(index, "name", self._index.name)
)
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
expected = gser @ [12, 13]

assert_eq(expected, actual)


def test_dataframe_duplicate_index_reindex():
    """Reindex a DataFrame whose index has duplicate labels, on cuDF and pandas.

    The same ``reindex`` call is handed to ``assert_exceptions_equal`` for the
    cuDF frame and its pandas copy (presumably it verifies that both sides
    raise matching exceptions -- confirm against the helper's definition).
    """
    # Labels 0 and 1 each occur twice, so the index is not unique.
    duplicated_labels = [0, 0, 1, 1]
    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=duplicated_labels)
    pdf = gdf.to_pandas()

    # Separate (args, kwargs) pairs so the two calls share no objects.
    cudf_call = ([10, 11, 12, 13], {})
    pandas_call = ([10, 11, 12, 13], {})

    assert_exceptions_equal(
        gdf.reindex,
        pdf.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


def test_series_duplicate_index_reindex():
    """Reindex a Series whose index has duplicate labels, on cuDF and pandas.

    The same ``reindex`` call is handed to ``assert_exceptions_equal`` for the
    cuDF series and its pandas copy (presumably it verifies that both sides
    raise matching exceptions -- confirm against the helper's definition).
    """
    # Labels 0 and 1 each occur twice, so the index is not unique.
    duplicated_labels = [0, 0, 1, 1]
    gs = cudf.Series([0, 1, 2, 3], index=duplicated_labels)
    ps = gs.to_pandas()

    # Separate (args, kwargs) pairs so the two calls share no objects.
    cudf_call = ([10, 11, 12, 13], {})
    pandas_call = ([10, 11, 12, 13], {})

    assert_exceptions_equal(
        gs.reindex,
        ps.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
13 changes: 4 additions & 9 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,28 +427,23 @@ def union_categoricals_cudf(
)


@_dask_cudf_nvtx_annotate
def safe_hash(frame):
    """Return ``frame.hash_values()`` as a ``cudf.Series`` on ``frame.index``."""
    hashed = frame.hash_values()
    # Re-attach the input's own index to the hashed values.
    return cudf.Series(hashed, index=frame.index)


@hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
@_dask_cudf_nvtx_annotate
def hash_object_cudf(frame, index=True):
if index:
return safe_hash(frame.reset_index())
return safe_hash(frame)
frame = frame.reset_index()
return frame.hash_values()


@hash_object_dispatch.register(cudf.BaseIndex)
@_dask_cudf_nvtx_annotate
def hash_object_cudf_index(ind, index=None):

if isinstance(ind, cudf.MultiIndex):
return safe_hash(ind.to_frame(index=False))
return ind.to_frame(index=False).hash_values()

col = cudf.core.column.as_column(ind)
return safe_hash(cudf.Series(col))
return cudf.Series(col).hash_values()


@group_split_dispatch.register((cudf.Series, cudf.DataFrame))
Expand Down