From 503bb3ca98da5fd1a45806852f7db07c935e78bd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:17:33 -0700 Subject: [PATCH] Delay materializing RangeIndex in .reset_index --- python/cudf/cudf/core/_base_index.py | 23 +++++--- python/cudf/cudf/core/index.py | 8 +++ python/cudf/cudf/core/indexed_frame.py | 35 +++++------ python/cudf/cudf/core/multiindex.py | 74 ++++++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 8 +++ 5 files changed, 94 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de44f392eef..742198b1e61 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,6 +4,7 @@ import pickle import warnings +from collections.abc import Generator from functools import cached_property from typing import Any, Literal, Set, Tuple @@ -2183,14 +2184,20 @@ def repeat(self, repeats, axis=None): """ raise NotImplementedError - def _split_columns_by_levels(self, levels): - if isinstance(levels, int) and levels > 0: - raise ValueError(f"Out of bound level: {levels}") - return ( - [self._data[self.name]], - [], - ["index" if self.name is None else self.name], - [], + def _new_index_for_reset_index( + self, levels: tuple | None, name + ) -> None | BaseIndex: + """Return the new index after .reset_index""" + # None is caught later to return RangeIndex + return None + + def _columns_for_reset_index( + self, levels: tuple | None + ) -> Generator[tuple[Any, ColumnBase], None, None]: + """Return the columns and column names for .reset_index""" + yield ( + "index" if self.name is None else self.name, + next(iter(self._columns)), ) def _split(self, splits): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6f08b1d83b3..063286754f0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,6 +5,7 @@ import operator import pickle import warnings +from collections.abc import Generator from functools import cache, cached_property from numbers import Number from typing import ( @@ -955,6 +956,13 @@ def __pos__(self): def __abs__(self): return abs(self._as_int_index()) + def _columns_for_reset_index( + self, levels: tuple | None + ) -> Generator[tuple[Any, ColumnBase], None, None]: + """Return the columns and column names for .reset_index""" + # We need to explicitly materialize the RangeIndex to a column + yield "index" if self.name is None else self.name, as_column(self) + @_warn_no_dask_cudf def __dask_tokenize__(self): return (type(self), self.start, self.stop, self.step) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 48e80d8162f..e161723747b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4338,34 +4338,27 @@ def take(self, indices, axis=0): def _reset_index(self, level, drop, col_level=0, col_fill=""): """Shared path for DataFrame.reset_index and Series.reset_index.""" - if level is not None and not isinstance(level, (tuple, list)): - level = (level,) + if level is not None: + if ( + isinstance(level, int) + and level > 0 + and not isinstance(self.index, MultiIndex) + ): + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) + if not isinstance(level, (tuple, list)): + level = (level,) _check_duplicate_level_names(level, self._index.names) - # Split the columns in the index into data and index columns - ( - data_columns, - index_columns, - data_names, - index_names, - ) = self._index._split_columns_by_levels(level) - if index_columns: - index = _index_from_data( - dict(enumerate(index_columns)), - name=self._index.name, - ) - if isinstance(index, MultiIndex): - index.names = index_names - else: - index.name = index_names[0] - else: + index = self.index._new_index_for_reset_index(level, self.index.name) + if index is None: index = RangeIndex(len(self)) - if drop: return self._data, index new_column_data = {} - for name, col in zip(data_names, data_columns): + for name, col in self.index._columns_for_reset_index(level): if name == "index" and "index" in self._data: name = "level_0" name = ( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 019daacddba..cd89cb74db6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,6 +8,7 @@ import pickle import warnings from collections import abc +from collections.abc import Generator from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Tuple, Union @@ -2051,41 +2052,64 @@ def _copy_type_metadata( return res @_cudf_nvtx_annotate - def _split_columns_by_levels(self, levels): + def _split_columns_by_levels( + self, levels: tuple, *, in_levels: bool + ) -> Generator[tuple[Any, column.ColumnBase], None, None]: # This function assumes that for levels with duplicate names, they are # specified by indices, not name by ``levels``. E.g. [None, None] can # only be specified by 0, 1, not "None". - - if levels is None: - return ( - list(self._data.columns), - [], - [ - f"level_{i}" if name is None else name - for i, name in enumerate(self.names) - ], - [], - ) - - # Normalize named levels into indices level_names = list(self.names) level_indices = { lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - - # Split the columns - data_columns, index_columns = [], [] - data_names, index_names = [], [] for i, (name, col) in enumerate(zip(self.names, self._data.columns)): - if i in level_indices: + if in_levels and i in level_indices: name = f"level_{i}" if name is None else name - data_columns.append(col) - data_names.append(name) - else: - index_columns.append(col) - index_names.append(name) - return data_columns, index_columns, data_names, index_names + yield name, col + elif not in_levels and i not in level_indices: + yield name, col + + @_cudf_nvtx_annotate + def _new_index_for_reset_index( + self, levels: tuple | None, name + ) -> None | BaseIndex: + """Return the new index after .reset_index""" + if levels is None: + return None + + index_columns, index_names = [], [] + for name, col in self._split_columns_by_levels( + levels, in_levels=False + ): + index_columns.append(col) + index_names.append(name) + + if not index_columns: + # None is caught later to return RangeIndex + return None + + index = cudf.core.index._index_from_data( + dict(enumerate(index_columns)), + name=name, + ) + if isinstance(index, type(self)): + index.names = index_names + else: + index.name = index_names[0] + return index + + def _columns_for_reset_index( + self, levels: tuple | None + ) -> Generator[tuple[Any, column.ColumnBase], None, None]: + """Return the columns and column names for .reset_index""" + if levels is None: + for i, (col, name) in enumerate( + zip(self._data.columns, self.names) + ): + yield f"level_{i}" if name is None else name, col + else: + yield from self._split_columns_by_levels(levels, in_levels=True) def repeat(self, repeats, axis=None): return self._from_columns_like_self( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 59e8b41e51a..2527f9356f7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3208,6 +3208,14 @@ def test_reset_index_unnamed( assert_eq(expect, got) +def test_reset_index_invalid_level(): + with pytest.raises(IndexError): + cudf.DataFrame([1]).reset_index(level=2) + + with pytest.raises(IndexError): + pd.DataFrame([1]).reset_index(level=2) + + @pytest.mark.parametrize( "data", [