Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delay materializing RangeIndex in .reset_index #15588

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pickle
import warnings
from collections.abc import Generator
from functools import cached_property
from typing import Any, Literal, Set, Tuple

Expand Down Expand Up @@ -2183,14 +2184,20 @@ def repeat(self, repeats, axis=None):
"""
raise NotImplementedError

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
return (
[self._data[self.name]],
[],
["index" if self.name is None else self.name],
[],
def _new_index_for_reset_index(
self, levels: tuple | None, name
) -> None | BaseIndex:
"""Return the new index after .reset_index"""
# None is caught later to return RangeIndex
return None

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
yield (
"index" if self.name is None else self.name,
next(iter(self._columns)),
)

def _split(self, splits):
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator
import pickle
import warnings
from collections.abc import Generator
from functools import cache, cached_property
from numbers import Number
from typing import (
Expand Down Expand Up @@ -955,6 +956,13 @@ def __pos__(self):
def __abs__(self):
return abs(self._as_int_index())

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
# We need to explicitly materialize the RangeIndex to a column
yield "index" if self.name is None else self.name, as_column(self)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

@_warn_no_dask_cudf
def __dask_tokenize__(self):
return (type(self), self.start, self.stop, self.step)
Expand Down
35 changes: 14 additions & 21 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4338,34 +4338,27 @@ def take(self, indices, axis=0):

def _reset_index(self, level, drop, col_level=0, col_fill=""):
"""Shared path for DataFrame.reset_index and Series.reset_index."""
if level is not None and not isinstance(level, (tuple, list)):
level = (level,)
if level is not None:
if (
isinstance(level, int)
and level > 0
and not isinstance(self.index, MultiIndex)
):
raise IndexError(
f"Too many levels: Index has only 1 level, not {level + 1}"
)
if not isinstance(level, (tuple, list)):
level = (level,)
_check_duplicate_level_names(level, self._index.names)

# Split the columns in the index into data and index columns
(
data_columns,
index_columns,
data_names,
index_names,
) = self._index._split_columns_by_levels(level)
if index_columns:
index = _index_from_data(
dict(enumerate(index_columns)),
name=self._index.name,
)
if isinstance(index, MultiIndex):
index.names = index_names
else:
index.name = index_names[0]
else:
index = self.index._new_index_for_reset_index(level, self.index.name)
if index is None:
index = RangeIndex(len(self))

if drop:
return self._data, index

new_column_data = {}
for name, col in zip(data_names, data_columns):
for name, col in self.index._columns_for_reset_index(level):
if name == "index" and "index" in self._data:
name = "level_0"
name = (
Expand Down
74 changes: 49 additions & 25 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pickle
import warnings
from collections import abc
from collections.abc import Generator
from functools import cached_property
from numbers import Integral
from typing import Any, List, MutableMapping, Tuple, Union
Expand Down Expand Up @@ -2051,41 +2052,64 @@ def _copy_type_metadata(
return res

@_cudf_nvtx_annotate
def _split_columns_by_levels(self, levels):
def _split_columns_by_levels(
self, levels: tuple, *, in_levels: bool
) -> Generator[tuple[Any, column.ColumnBase], None, None]:
# This function assumes that for levels with duplicate names, they are
# specified by indices, not name by ``levels``. E.g. [None, None] can
# only be specified by 0, 1, not "None".

if levels is None:
return (
list(self._data.columns),
[],
[
f"level_{i}" if name is None else name
for i, name in enumerate(self.names)
],
[],
)

# Normalize named levels into indices
level_names = list(self.names)
level_indices = {
lv if isinstance(lv, int) else level_names.index(lv)
for lv in levels
}

# Split the columns
data_columns, index_columns = [], []
data_names, index_names = [], []
for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
if i in level_indices:
if in_levels and i in level_indices:
name = f"level_{i}" if name is None else name
data_columns.append(col)
data_names.append(name)
else:
index_columns.append(col)
index_names.append(name)
return data_columns, index_columns, data_names, index_names
yield name, col
elif not in_levels and i not in level_indices:
yield name, col

@_cudf_nvtx_annotate
def _new_index_for_reset_index(
self, levels: tuple | None, name
) -> None | BaseIndex:
"""Return the new index after .reset_index"""
if levels is None:
return None

index_columns, index_names = [], []
for name, col in self._split_columns_by_levels(
levels, in_levels=False
):
index_columns.append(col)
index_names.append(name)

if not index_columns:
# None is caught later to return RangeIndex
return None

index = cudf.core.index._index_from_data(
dict(enumerate(index_columns)),
name=name,
)
if isinstance(index, type(self)):
index.names = index_names
else:
index.name = index_names[0]
return index

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, column.ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
if levels is None:
for i, (col, name) in enumerate(
zip(self._data.columns, self.names)
):
yield f"level_{i}" if name is None else name, col
else:
yield from self._split_columns_by_levels(levels, in_levels=True)

def repeat(self, repeats, axis=None):
return self._from_columns_like_self(
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3208,6 +3208,14 @@ def test_reset_index_unnamed(
assert_eq(expect, got)


def test_reset_index_invalid_level():
with pytest.raises(IndexError):
cudf.DataFrame([1]).reset_index(level=2)

with pytest.raises(IndexError):
pd.DataFrame([1]).reset_index(level=2)


@pytest.mark.parametrize(
"data",
[
Expand Down
Loading