Skip to content

Commit

Permalink
Consolidate and improve reset_index (#9750)
Browse files Browse the repository at this point in the history
Partial of #9038 

This is a rewrite of `reset_index` that shares common logic between `Series` and `DataFrame`. It extends its capability to handle the `level` argument for multi-level indexes, and `col_level` and `col_fill` for multi-level column names. It also adds `name` argument support to the Series API.

Authors:
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9750
  • Loading branch information
isVoid authored Jan 8, 2022
1 parent 3192ace commit 0722e20
Show file tree
Hide file tree
Showing 10 changed files with 503 additions and 118 deletions.
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,16 @@ def from_pandas(cls, index, nan_as_null=None):
def _constructor_expanddim(self):
return cudf.MultiIndex

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
return (
[self._data[self.name]],
[],
["index" if self.name is None else self.name],
[],
)


def _get_result_name(left_name, right_name):
if left_name == right_name:
Expand Down
111 changes: 51 additions & 60 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
_FrameIndexer,
_get_label_range_or_mask,
_indices_from_labels,
doc_reset_index_template,
)
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import DataFrameResampler
Expand Down Expand Up @@ -2429,29 +2430,13 @@ def set_index(
df.index = idx
return df if not inplace else None

def reset_index(
self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
):
"""
Reset the index.
Reset the index of the DataFrame, and use the default one instead.
Parameters
----------
drop : bool, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
inplace : bool, default False
Modify the DataFrame in place (do not create a new object).
Returns
-------
DataFrame or None
DataFrame with the new index or None if ``inplace=True``.
Examples
--------
@docutils.doc_apply(
doc_reset_index_template.format(
klass="DataFrame",
argument="",
return_type="DataFrame or None",
return_doc="",
example="""
>>> df = cudf.DataFrame([('bird', 389.0),
... ('bird', 24.0),
... ('mammal', 80.5),
Expand All @@ -2476,45 +2461,51 @@ class max_speed
1 bird 24.0
2 mammal 80.5
3 mammal <NA>
"""
if level is not None:
raise NotImplementedError("level parameter is not supported yet.")

if col_level != 0:
raise NotImplementedError(
"col_level parameter is not supported yet."
)
if col_fill != "":
raise NotImplementedError(
"col_fill parameter is not supported yet."
)

result = self if inplace else self.copy()

if not drop:
if isinstance(self.index, MultiIndex):
names = tuple(
name if name is not None else f"level_{i}"
for i, name in enumerate(self.index.names)
You can also use ``reset_index`` with MultiIndex.
>>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'),
... ('bird', 'parrot'),
... ('mammal', 'lion'),
... ('mammal', 'monkey')],
... names=['class', 'name'])
>>> df = cudf.DataFrame([(389.0, 'fly'),
... ( 24.0, 'fly'),
... ( 80.5, 'run'),
... (np.nan, 'jump')],
... index=index,
... columns=('speed', 'type'))
>>> df
speed type
class name
bird falcon 389.0 fly
parrot 24.0 fly
mammal lion 80.5 run
monkey <NA> jump
>>> df.reset_index(level='class')
class speed type
name
falcon bird 389.0 fly
parrot bird 24.0 fly
lion mammal 80.5 run
monkey mammal <NA> jump
""",
)
)
def reset_index(
self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
):
return self._mimic_inplace(
DataFrame._from_data(
*self._reset_index(
level=level,
drop=drop,
col_level=col_level,
col_fill=col_fill,
)
else:
if self.index.name is None:
if "index" in self._data.names:
names = ("level_0",)
else:
names = ("index",)
else:
names = (self.index.name,)

index_columns = self.index._data.columns
for name, index_column in zip(
reversed(names), reversed(index_columns)
):
result.insert(0, name, index_column)
result.index = RangeIndex(len(self))
if not inplace:
return result
),
inplace=inplace,
)

def take(self, indices, axis=0, keep_index=None):
axis = self._get_axis_from_axis_arg(axis)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def _from_columns(
n_index_columns = 0
if index_names is not None:
n_index_columns = len(index_names)
index = cudf.core.index._index_from_data(
dict(zip(range(n_index_columns), columns))
index = cudf.core.index._index_from_columns(
columns[:n_index_columns]
)
if isinstance(index, cudf.MultiIndex):
index.names = index_names
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def _index_from_data(data: MutableMapping, name: Any = None):
return index_class_type._from_data(data, None, name)


def _index_from_columns(
    columns: List[cudf.core.column.ColumnBase], name: Any = None
):
    """Build an index out of ``columns``, labelling the levels 0, 1, 2, ...

    The columns are keyed by their position and handed to
    ``_index_from_data`` together with ``name``.
    """
    positional_data = dict(enumerate(columns))
    return _index_from_data(positional_data, name=name)


class RangeIndex(BaseIndex):
"""
Immutable Index implementing a monotonic integer range.
Expand Down
94 changes: 92 additions & 2 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import operator
import warnings
from collections import abc
from collections import Counter, abc
from typing import Callable, Type, TypeVar
from uuid import uuid4

Expand All @@ -24,11 +24,37 @@
is_list_like,
)
from cudf.core.column import arange
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.index import Index
from cudf.core.index import Index, RangeIndex, _index_from_columns
from cudf.core.multiindex import MultiIndex
from cudf.utils.utils import _gather_map_is_valid, cached_property

# Shared docstring template for ``reset_index``. It is rendered with
# ``str.format`` using the fields ``klass``, ``argument``, ``return_type``,
# ``return_doc`` and ``example``, so the wording must stay neutral with
# respect to the class it is rendered for.
doc_reset_index_template = """
Reset the index of the {klass}, or a level of it.
Parameters
----------
level : int, str, tuple, or list, default None
Only remove the given levels from the index. Removes all levels by
default.
drop : bool, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
{argument}
inplace : bool, default False
Modify the {klass} in place (do not create a new object).
Returns
-------
{return_type}
{klass} with the new index or None if ``inplace=True``.{return_doc}
Examples
--------
{example}
"""


def _indices_from_labels(obj, labels):
from cudf.core.column import column
Expand Down Expand Up @@ -1171,6 +1197,53 @@ def resample(
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _reset_index(self, level, drop, col_level=0, col_fill=""):
    """Common implementation behind DataFrame/Series ``reset_index``.

    A scalar ``level`` is wrapped into a one-element tuple, the requested
    levels are checked against duplicated index level names, and the
    index is split into the columns that leave the index and the columns
    that stay in it. The remaining columns form the new index; when none
    remain, a ``RangeIndex`` of the frame's length is used.

    Returns a ``(data, index)`` pair. With ``drop`` set, ``data`` is the
    frame's existing column data; otherwise the columns leaving the index
    are placed in front of the existing data columns.
    """
    if not (level is None or isinstance(level, (tuple, list))):
        level = (level,)
    _check_duplicate_level_names(level, self._index.names)

    # Partition the index columns: those moving into the data versus those
    # staying behind as index levels.
    (
        moved_columns,
        kept_columns,
        moved_names,
        kept_names,
    ) = self._index._split_columns_by_levels(level)

    if not kept_columns:
        new_index = RangeIndex(len(self))
    else:
        new_index = _index_from_columns(
            kept_columns, name=self._index.name
        )
        if isinstance(new_index, MultiIndex):
            new_index.names = kept_names
        else:
            new_index.name = kept_names[0]

    if drop:
        return self._data, new_index

    inserted = {}
    for label, column in zip(moved_names, moved_columns):
        # Avoid clashing with an existing data column called "index".
        if label == "index" and "index" in self._data:
            label = "level_0"
        if self._data.multiindex:
            # Multi-level column labels: put the name at ``col_level`` and
            # pad every other level with ``col_fill``.
            label = tuple(
                label if depth == col_level else col_fill
                for depth in range(self._data.nlevels)
            )
        inserted[label] = column

    # This is to match pandas where the new data columns are always
    # inserted to the left of existing data columns.
    combined = ColumnAccessor(
        {**inserted, **self._data}, self._data.multiindex
    )
    return combined, new_index

def _first_or_last(
self, offset, idx: int, op: Callable, side: str, slice_func: Callable
) -> "IndexedFrame":
Expand Down Expand Up @@ -1292,3 +1365,20 @@ def last(self, offset):
side="right",
slice_func=lambda i: self.iloc[i:],
)


def _check_duplicate_level_names(specified, level_names):
"""Raise if any of `specified` has duplicates in `level_names`."""
if specified is None:
return
if len(set(level_names)) == len(level_names):
return
duplicates = {key for key, val in Counter(level_names).items() if val > 1}

duplicates_specified = [spec for spec in specified if spec in duplicates]
if not len(duplicates_specified) == 0:
# Note: pandas raises first encountered duplicates, cuDF raises all.
raise ValueError(
f"The names {duplicates_specified} occurs multiple times, use a"
" level number"
)
36 changes: 36 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,3 +1743,39 @@ def _intersection(self, other, sort=None):
if sort is None and len(other):
return midx.sort_values()
return midx

def _split_columns_by_levels(self, levels):
# This function assumes that for levels with duplicate names, they are
# specified by indices, not name by ``levels``. E.g. [None, None] can
# only be specified by 0, 1, not "None".

if levels is None:
return (
list(self._data.columns),
[],
[
f"level_{i}" if name is None else name
for i, name in enumerate(self.names)
],
[],
)

# Normalize named levels into indices
level_names = list(self.names)
level_indices = {
lv if isinstance(lv, int) else level_names.index(lv)
for lv in levels
}

# Split the columns
data_columns, index_columns = [], []
data_names, index_names = [], []
for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
if i in level_indices:
name = f"level_{i}" if name is None else name
data_columns.append(col)
data_names.append(name)
else:
index_columns.append(col)
index_names.append(name)
return data_columns, index_columns, data_names, index_names
Loading

0 comments on commit 0722e20

Please sign in to comment.