Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-23.12 to branch-24.02 #14406

Merged
merged 17 commits into from
Nov 16, 2023
Merged
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Raise error in reindex when index is not unique (#14400)
Fixes: #14398 
This PR makes the `reindex` API raise a `ValueError` when it is called on an object whose existing index contains duplicate labels.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14400
galipremsagar authored Nov 15, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 8deb3dd7573000e7d87f18a9e2bbe39cf2932e10
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
@@ -2607,6 +2607,10 @@ def _reindex(

df = self
if index is not None:
if not df._index.is_unique:
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index, name=getattr(index, "name", self._index.name)
)
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
expected = gser @ [12, 13]

assert_eq(expected, actual)


def test_dataframe_duplicate_index_reindex():
    """Reindex a DataFrame whose index has duplicate labels.

    The cudf call and the equivalent pandas call are handed to
    ``assert_exceptions_equal`` with the same target labels, so the
    exceptions raised by the two libraries are compared.
    """
    # Index labels 0 and 1 each appear twice, making the index non-unique.
    cudf_frame = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
    pandas_frame = cudf_frame.to_pandas()

    # Each side gets its own (args, kwargs) pair with identical contents.
    target_labels = [10, 11, 12, 13]
    cudf_call = (list(target_labels), {})
    pandas_call = (list(target_labels), {})

    assert_exceptions_equal(
        cudf_frame.reindex,
        pandas_frame.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


def test_series_duplicate_index_reindex():
    """Reindex a Series whose index has duplicate labels.

    The cudf call and the equivalent pandas call are handed to
    ``assert_exceptions_equal`` with the same target labels, so the
    exceptions raised by the two libraries are compared.
    """
    # Index labels 0 and 1 each appear twice, making the index non-unique.
    cudf_series = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
    pandas_series = cudf_series.to_pandas()

    # Each side gets its own (args, kwargs) pair with identical contents.
    target_labels = [10, 11, 12, 13]
    cudf_call = (list(target_labels), {})
    pandas_call = (list(target_labels), {})

    assert_exceptions_equal(
        cudf_series.reindex,
        pandas_series.reindex,
        lfunc_args_and_kwargs=cudf_call,
        rfunc_args_and_kwargs=pandas_call,
    )
13 changes: 4 additions & 9 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
@@ -427,28 +427,23 @@ def union_categoricals_cudf(
)


@_dask_cudf_nvtx_annotate
def safe_hash(frame):
return cudf.Series(frame.hash_values(), index=frame.index)


@hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
@_dask_cudf_nvtx_annotate
def hash_object_cudf(frame, index=True):
if index:
return safe_hash(frame.reset_index())
return safe_hash(frame)
frame = frame.reset_index()
return frame.hash_values()


@hash_object_dispatch.register(cudf.BaseIndex)
@_dask_cudf_nvtx_annotate
def hash_object_cudf_index(ind, index=None):

if isinstance(ind, cudf.MultiIndex):
return safe_hash(ind.to_frame(index=False))
return ind.to_frame(index=False).hash_values()

col = cudf.core.column.as_column(ind)
return safe_hash(cudf.Series(col))
return cudf.Series(col).hash_values()


@group_split_dispatch.register((cudf.Series, cudf.DataFrame))