Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Fix loc-getitem ordering when index contains duplicate labels
Browse files Browse the repository at this point in the history
If the index has duplicate labels, it is not sufficient to sort by the
ordering induced by gathering the requested keys: rows that share a
label come out of the join against the table being looked up from in
non-deterministic order. To fix this, additionally sort by a secondary
key, namely the ordering induced by gathering from the donor table
(i.e. each row's original position in it).

- Closes rapidsai#13658
  • Loading branch information
wence- committed Jul 4, 2023
1 parent aed7174 commit e82fb11
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 3 deletions.
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,14 @@ def _getitem_tuple_arg(self, arg):
{tmp_col_name: column.arange(len(tmp_arg[0]))},
index=as_index(tmp_arg[0]),
)
cantor_name = "_" + "_".join(columns_df.columns)
columns_df[cantor_name] = column.arange(len(columns_df))
df = other_df.join(columns_df, how="inner")
# as join is not assigning any names to index,
# update it over here
df.index.name = columns_df.index.name
df = df.sort_values(tmp_col_name)
df.drop(columns=[tmp_col_name], inplace=True)
df = df.sort_values(by=[tmp_col_name, cantor_name])
df.drop(columns=[tmp_col_name, cantor_name], inplace=True)
# There were no indices found
if len(df) == 0:
raise KeyError(arg)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def _indices_from_labels(obj, labels):
rhs = cudf.DataFrame(
{"_": cudf.core.column.arange(len(obj))}, index=obj.index
)
return lhs.join(rhs).sort_values("__")["_"]
return lhs.join(rhs).sort_values(by=["__", "_"])["_"]


def _get_label_range_or_mask(index, start, stop, step):
Expand Down
14 changes: 14 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1996,6 +1996,20 @@ def test_loc_missing_label_keyerror_issue_13379(index):
cdf.loc[[0, 5]]


@pytest.mark.parametrize("series", [True, False], ids=["Series", "DataFrame"])
def test_loc_repeated_label_ordering_issue_13658(series):
    """Compare ``.loc[[1]]`` with pandas when every index label is ``1``.

    Builds a 2048-row Series (``series=True``) or single-column DataFrame
    (``series=False``) whose index consists solely of the label ``1``,
    then asserts that the cudf ``.loc`` result equals the result of the
    same lookup on the pandas conversion of the frame.
    """
    # https://github.com/rapidsai/cudf/issues/13658
    n_rows = 2048
    data = range(n_rows)
    labels = [1] * n_rows
    frame = (
        cudf.Series(data, index=labels)
        if series
        else cudf.DataFrame({"a": data}, index=labels)
    )
    # pandas result is the reference the cudf lookup is checked against.
    expect = frame.to_pandas().loc[[1]]
    actual = frame.loc[[1]]
    assert_eq(actual, expect)


class TestLocIndexWithOrder:
# https://github.com/rapidsai/cudf/issues/12833
@pytest.fixture(params=["increasing", "decreasing", "neither"])
Expand Down

0 comments on commit e82fb11

Please sign in to comment.