Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate and improve reset_index #9750

Merged
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
6edc4be
initial pass
isVoid Nov 21, 2021
8416b94
handling level=None, check duplicate level names, move level name han…
isVoid Nov 21, 2021
41986ff
first pass clean up, not returning column indices from helpers
isVoid Nov 22, 2021
3993530
lining up `name` behavior in series api
isVoid Nov 22, 2021
f90bdd4
duplicate same tests to series
isVoid Nov 22, 2021
f612e65
minor improvements and extra test cases
isVoid Nov 29, 2021
b05a583
:wqMerge branch 'branch-22.02' of github.com:rapidsai/cudf into impro…
isVoid Nov 29, 2021
7ed329b
Make use of docfmt and docutils
isVoid Nov 29, 2021
dfce947
Minor doc improvement
isVoid Nov 30, 2021
6210abb
use doc_apply
isVoid Dec 4, 2021
8d8cced
style
isVoid Dec 4, 2021
ec496c0
Commits review changes in check duplicates func
isVoid Dec 4, 2021
c3bb37c
Raise proper error when duplicate name specified; improve assertion o…
isVoid Dec 4, 2021
dc2746d
Update python/cudf/cudf/core/multiindex.py
isVoid Dec 4, 2021
50980b7
rename var
isVoid Dec 4, 2021
5e2078e
Special handling prior to loop
isVoid Dec 4, 2021
2c70c48
improve series test
isVoid Dec 4, 2021
44583fb
not skipping tests when drop=False and inplace=True
isVoid Dec 4, 2021
0413858
revert factory pending discussion
isVoid Dec 4, 2021
5b81aa3
add _index_from_columns helper
isVoid Dec 7, 2021
45ff7d6
doc
isVoid Dec 7, 2021
4e38d97
make naming consistent
isVoid Dec 7, 2021
1eb9873
raise all encountered duplicates
isVoid Dec 8, 2021
b9dc154
move doc template to indexedframe
isVoid Dec 8, 2021
475a7f7
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 8, 2021
06a9d6a
test fix
isVoid Dec 8, 2021
e5c2f49
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 8, 2021
0772bca
fix broken tests due to change in exception raised
isVoid Dec 10, 2021
b81e39f
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 10, 2021
6372794
also support multilevel column names in reset index
isVoid Dec 10, 2021
fb02ae3
move helper inline
isVoid Dec 15, 2021
0f40453
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Jan 5, 2022
1fb372c
fix docstring
galipremsagar Jan 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,16 @@ def from_pandas(cls, index, nan_as_null=None):
def _constructor_expanddim(self):
return cudf.MultiIndex

def _split_columns_by_levels(self, levels):
    """Split this single-level index into data and index columns.

    A single-level index only has one column, so whatever is requested,
    that column is returned as the one to move out of the index and
    nothing is left behind as index.

    Parameters
    ----------
    levels : int, label, tuple, list, or None
        The level selection. Callers may pass either a bare level or a
        tuple/list of levels; both forms are validated.

    Returns
    -------
    tuple of four lists
        ``(data_columns, index_columns, data_names, index_names)``.
        ``data_columns`` holds the single index column, ``data_names``
        holds its name (``"index"`` when the index is unnamed), and both
        ``index_*`` lists are empty.

    Raises
    ------
    ValueError
        If any integer level is not a valid position for a single-level
        index (only ``0`` and ``-1`` are valid).
    """
    # Normalize the selection so that a bare level and a collection of
    # levels go through the same bounds check.
    if levels is None:
        requested = ()
    elif isinstance(levels, (tuple, list)):
        requested = levels
    else:
        requested = (levels,)

    for lv in requested:
        # Only integers are positional; labels are not bounds-checked here.
        if isinstance(lv, int) and not -1 <= lv <= 0:
            raise ValueError(f"Out of bound level: {lv}")

    return (
        [self._data[self.name]],
        [],
        ["index" if self.name is None else self.name],
        [],
    )


def _get_result_name(left_name, right_name):
if left_name == right_name:
Expand Down
111 changes: 51 additions & 60 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
_FrameIndexer,
_get_label_range_or_mask,
_indices_from_labels,
doc_reset_index_template,
)
from cudf.core.resample import DataFrameResampler
from cudf.core.series import Series
Expand Down Expand Up @@ -2486,29 +2487,13 @@ def set_index(
df.index = idx
return df if not inplace else None

def reset_index(
self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
):
"""
Reset the index.

Reset the index of the DataFrame, and use the default one instead.

Parameters
----------
drop : bool, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
inplace : bool, default False
Modify the DataFrame in place (do not create a new object).

Returns
-------
DataFrame or None
DataFrame with the new index or None if ``inplace=True``.

Examples
--------
@docutils.doc_apply(
doc_reset_index_template.format(
klass="DataFrame",
argument="",
return_type="DataFrame or None",
return_doc="",
example="""
>>> df = cudf.DataFrame([('bird', 389.0),
... ('bird', 24.0),
... ('mammal', 80.5),
Expand All @@ -2533,45 +2518,51 @@ class max_speed
1 bird 24.0
2 mammal 80.5
3 mammal <NA>
"""
if level is not None:
raise NotImplementedError("level parameter is not supported yet.")

if col_level != 0:
raise NotImplementedError(
"col_level parameter is not supported yet."
)

if col_fill != "":
raise NotImplementedError(
"col_fill parameter is not supported yet."
)

result = self if inplace else self.copy()

if not drop:
if isinstance(self.index, cudf.MultiIndex):
names = tuple(
name if name is not None else f"level_{i}"
for i, name in enumerate(self.index.names)
You can also use ``reset_index`` with MultiIndex.

>>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'),
... ('bird', 'parrot'),
... ('mammal', 'lion'),
... ('mammal', 'monkey')],
... names=['class', 'name'])
>>> df = cudf.DataFrame([(389.0, 'fly'),
... ( 24.0, 'fly'),
... ( 80.5, 'run'),
... (np.nan, 'jump')],
... index=index,
... columns=('speed', 'type'))
>>> df
speed type
class name
bird falcon 389.0 fly
parrot 24.0 fly
mammal lion 80.5 run
monkey <NA> jump
>>> df.reset_index(level='class')
class speed type
name
falcon bird 389.0 fly
parrot bird 24.0 fly
lion mammal 80.5 run
monkey mammal <NA> jump
""",
)
)
def reset_index(
self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
):
return self._mimic_inplace(
DataFrame._from_data(
*self._reset_index(
level=level,
drop=drop,
col_level=col_level,
col_fill=col_fill,
)
else:
if self.index.name is None:
if "index" in self._data.names:
names = ("level_0",)
else:
names = ("index",)
else:
names = (self.index.name,)

index_columns = self.index._data.columns
for name, index_column in zip(
reversed(names), reversed(index_columns)
):
result.insert(0, name, index_column)
result.index = RangeIndex(len(self))
if not inplace:
return result
),
inplace=inplace,
)

def take(self, indices, axis=0, keep_index=None):
axis = self._get_axis_from_axis_arg(axis)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ def _from_columns(
n_index_columns = 0
if index_names is not None:
n_index_columns = len(index_names)
index = cudf.core.index._index_from_data(
dict(zip(range(n_index_columns), columns))
index = cudf.core.index._index_from_columns(
columns[:n_index_columns]
)
if isinstance(index, cudf.MultiIndex):
index.names = index_names
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def _index_from_data(data: MutableMapping, name: Any = None):
return index_class_type._from_data(data, None, name)


def _index_from_columns(
    columns: List[cudf.core.column.ColumnBase], name: Any = None
):
    """Build an index out of ``columns``.

    Each column is keyed by its position (0, 1, 2, ...) and the resulting
    mapping is handed to ``_index_from_data``; ``name`` is forwarded
    unchanged.
    """
    positional_data = dict(enumerate(columns))
    return _index_from_data(positional_data, name=name)


class RangeIndex(BaseIndex):
"""
Immutable Index implementing a monotonic integer range.
Expand Down
99 changes: 96 additions & 3 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from __future__ import annotations

import warnings
from collections import abc
from collections import Counter, abc
from typing import Type, TypeVar
from uuid import uuid4

Expand All @@ -23,10 +23,40 @@
is_list_like,
)
from cudf.core.column import arange
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.index import Index
from cudf.core.index import Index, RangeIndex, _index_from_columns
from cudf.core.multiindex import MultiIndex
from cudf.utils.utils import _gather_map_is_valid, cached_property
from cudf.utils.utils import (
_gather_map_is_valid,
_make_column_name,
cached_property,
)

# Shared docstring template for ``reset_index``. It is filled in with
# ``str.format`` using the placeholders ``klass``, ``argument``,
# ``return_type``, ``return_doc`` and ``example``.
doc_reset_index_template = """
Reset the index of the {klass}, or a level of it.

Parameters
----------
level : int, str, tuple, or list, default None
Only remove the given levels from the index. Removes all levels by
default.
drop : bool, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
{argument}
inplace : bool, default False
Modify the {klass} in place (do not create a new object).

Returns
-------
{return_type}
{klass} with the new index or None if ``inplace=True``.{return_doc}

Examples
--------
{example}
"""


def _indices_from_labels(obj, labels):
Expand Down Expand Up @@ -1104,3 +1134,66 @@ def resample(
if isinstance(self, cudf.Series)
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _reset_index(self, level, drop, col_level=0, col_fill=""):
    """Compute the data and index produced by resetting the index.

    Common implementation behind ``DataFrame.reset_index`` and
    ``Series.reset_index``. Returns a ``(data, index)`` pair: the new
    index built from the levels that were not removed (or a
    ``RangeIndex`` when none remain), and either the untouched column data
    (``drop=True``) or a ``ColumnAccessor`` with the removed index levels
    placed in front of the existing columns.
    """
    # A lone level is treated as a one-element selection.
    if not (level is None or isinstance(level, (tuple, list))):
        level = (level,)
    _check_duplicate_level_names(level, self._index.names)

    # Ask the index which of its columns become data and which stay index.
    moved_columns, kept_columns, moved_names, kept_names = (
        self._index._split_columns_by_levels(level)
    )

    if not kept_columns:
        new_index = RangeIndex(len(self))
    else:
        new_index = _index_from_columns(kept_columns, name=self._index.name)
        if isinstance(new_index, MultiIndex):
            new_index.names = kept_names
        else:
            new_index.name = kept_names[0]

    if drop:
        return self._data, new_index

    inserted = {}
    for label, column in zip(moved_names, moved_columns):
        # Do not collide with an existing "index" data column.
        if label == "index" and "index" in self._data:
            label = "level_0"
        key = _make_column_name(
            label,
            self._data.multiindex,
            col_level,
            col_fill,
            self._data.nlevels,
        )
        inserted[key] = column

    # Matching pandas: columns coming from the index always go to the left
    # of the columns that were already present.
    combined = {**inserted, **self._data}
    return ColumnAccessor(combined, self._data.multiindex), new_index


def _check_duplicate_level_names(specified, level_names):
    """Raise if any name in `specified` is duplicated in `level_names`.

    Parameters
    ----------
    specified : iterable or None
        The levels requested by the caller. ``None`` means no explicit
        selection and is never an error.
    level_names : sequence
        The names of all levels of the index.

    Raises
    ------
    ValueError
        If a level specified *by name* matches a name that appears more than
        once in `level_names`. Integer entries are level positions, not
        names, so they are never ambiguous and are not reported.
    """
    if specified is None:
        return

    name_counts = Counter(level_names)
    if len(name_counts) == len(level_names):
        # No repeated level name at all, so nothing can be ambiguous.
        return
    duplicates = {name for name, count in name_counts.items() if count > 1}

    duplicates_specified = [
        spec
        for spec in specified
        if not isinstance(spec, int) and spec in duplicates
    ]
    if duplicates_specified:
        # Note: pandas raises first encountered duplicates, cuDF raises all.
        raise ValueError(
            f"The names {duplicates_specified} occurs multiple times, use a"
            " level number"
        )
36 changes: 36 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,3 +1743,39 @@ def _intersection(self, other, sort=None):
if sort is None and len(other):
return midx.sort_values()
return midx

def _split_columns_by_levels(self, levels):
    """Split the index columns into those selected by `levels` and the rest.

    Levels with duplicate names are assumed to be specified by position
    rather than by name. E.g. names ``[None, None]`` can only be selected
    with ``0`` and ``1``, not with ``None``.

    Parameters
    ----------
    levels : iterable or None
        Levels to move out of the index, given as integer positions
        (negative values count from the last level) or as level names.
        ``None`` selects every level.

    Returns
    -------
    tuple of four lists
        ``(data_columns, index_columns, data_names, index_names)``. Selected
        levels go to the ``data_*`` lists, with unnamed levels labelled
        ``level_<position>``; the remaining levels go to the ``index_*``
        lists with their names unchanged.

    Raises
    ------
    ValueError
        If an integer level is outside the range of existing levels, or if
        a level name is not one of the index names.
    """
    level_names = list(self.names)
    columns = list(self._data.columns)

    if levels is None:
        return (
            columns,
            [],
            [
                f"level_{i}" if name is None else name
                for i, name in enumerate(level_names)
            ],
            [],
        )

    # Normalize every requested level into a non-negative position.
    nlevels = len(level_names)
    level_indices = set()
    for lv in levels:
        if isinstance(lv, int):
            position = lv + nlevels if lv < 0 else lv
            if not 0 <= position < nlevels:
                raise ValueError(f"Out of bound level: {lv}")
        else:
            position = level_names.index(lv)
        level_indices.add(position)

    # Split the columns, preserving the original level order on both sides.
    data_columns, index_columns = [], []
    data_names, index_names = [], []
    for i, (name, col) in enumerate(zip(level_names, columns)):
        if i in level_indices:
            data_columns.append(col)
            data_names.append(f"level_{i}" if name is None else name)
        else:
            index_columns.append(col)
            index_names.append(name)
    return data_columns, index_columns, data_names, index_names
Loading