Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate and improve reset_index #9750

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
6edc4be
initial pass
isVoid Nov 21, 2021
8416b94
handling level=None, check duplicate level names, move level name han…
isVoid Nov 21, 2021
41986ff
first pass clean up, not returning column indices from helpers
isVoid Nov 22, 2021
3993530
lining up `name` behavior in series api
isVoid Nov 22, 2021
f90bdd4
duplicate same tests to series
isVoid Nov 22, 2021
f612e65
minor improvements and extra test cases
isVoid Nov 29, 2021
b05a583
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into impro…
isVoid Nov 29, 2021
7ed329b
Make use of docfmt and docutils
isVoid Nov 29, 2021
dfce947
Minor doc improvement
isVoid Nov 30, 2021
6210abb
use doc_apply
isVoid Dec 4, 2021
8d8cced
style
isVoid Dec 4, 2021
ec496c0
Commits review changes in check duplicates func
isVoid Dec 4, 2021
c3bb37c
Raise proper error when duplicate name specified; improve assertion o…
isVoid Dec 4, 2021
dc2746d
Update python/cudf/cudf/core/multiindex.py
isVoid Dec 4, 2021
50980b7
rename var
isVoid Dec 4, 2021
5e2078e
Special handling prior to loop
isVoid Dec 4, 2021
2c70c48
improve series test
isVoid Dec 4, 2021
44583fb
not skipping tests when drop=False and inplace=True
isVoid Dec 4, 2021
0413858
revert factory pending discussion
isVoid Dec 4, 2021
5b81aa3
add _index_from_columns helper
isVoid Dec 7, 2021
45ff7d6
doc
isVoid Dec 7, 2021
4e38d97
make naming consistent
isVoid Dec 7, 2021
1eb9873
raise all encountered duplicates
isVoid Dec 8, 2021
b9dc154
move doc template to indexedframe
isVoid Dec 8, 2021
475a7f7
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 8, 2021
06a9d6a
test fix
isVoid Dec 8, 2021
e5c2f49
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 8, 2021
0772bca
fix broken tests due to change in exception raised
isVoid Dec 10, 2021
b81e39f
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Dec 10, 2021
6372794
also support multilevel column names in reset index
isVoid Dec 10, 2021
fb02ae3
move helper inline
isVoid Dec 15, 2021
0f40453
Merge branch 'branch-22.02' of github.com:rapidsai/cudf into improvem…
isVoid Jan 5, 2022
1fb372c
fix docstring
galipremsagar Jan 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,16 @@ def from_pandas(cls, index, nan_as_null=None):
def _constructor_expanddim(self):
return cudf.MultiIndex

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
return (
[self._data[self.name]],
[],
["index" if self.name is None else self.name],
[],
)


def _get_result_name(left_name, right_name):
if left_name == right_name:
Expand Down
94 changes: 12 additions & 82 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2486,92 +2486,22 @@ def set_index(
df.index = idx
return df if not inplace else None

@docutils.doc_dataframe_reset_index()
def reset_index(
self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
):
"""
Reset the index.

Reset the index of the DataFrame, and use the default one instead.

Parameters
----------
drop : bool, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
inplace : bool, default False
Modify the DataFrame in place (do not create a new object).

Returns
-------
DataFrame or None
DataFrame with the new index or None if ``inplace=True``.

Examples
--------
>>> df = cudf.DataFrame([('bird', 389.0),
... ('bird', 24.0),
... ('mammal', 80.5),
... ('mammal', np.nan)],
... index=['falcon', 'parrot', 'lion', 'monkey'],
... columns=('class', 'max_speed'))
>>> df
class max_speed
falcon bird 389.0
parrot bird 24.0
lion mammal 80.5
monkey mammal <NA>
>>> df.reset_index()
index class max_speed
0 falcon bird 389.0
1 parrot bird 24.0
2 lion mammal 80.5
3 monkey mammal <NA>
>>> df.reset_index(drop=True)
class max_speed
0 bird 389.0
1 bird 24.0
2 mammal 80.5
3 mammal <NA>
"""
if level is not None:
raise NotImplementedError("level parameter is not supported yet.")

if col_level != 0:
raise NotImplementedError(
"col_level parameter is not supported yet."
)

if col_fill != "":
raise NotImplementedError(
"col_fill parameter is not supported yet."
)

result = self if inplace else self.copy()

if not drop:
if isinstance(self.index, cudf.MultiIndex):
names = tuple(
name if name is not None else f"level_{i}"
for i, name in enumerate(self.index.names)
"""{docstring}"""
isVoid marked this conversation as resolved.
Show resolved Hide resolved
return self._mimic_inplace(
DataFrame._from_data(
*self._reset_index(
level=level,
drop=drop,
col_level=col_level,
col_fill=col_fill,
)
else:
if self.index.name is None:
if "index" in self._data.names:
names = ("level_0",)
else:
names = ("index",)
else:
names = (self.index.name,)

index_columns = self.index._data.columns
for name, index_column in zip(
reversed(names), reversed(index_columns)
):
result.insert(0, name, index_column)
result.index = RangeIndex(len(self))
if not inplace:
return result
),
inplace=inplace,
)

def take(self, indices, axis=0, keep_index=None):
axis = self._get_axis_from_axis_arg(axis)
Expand Down
65 changes: 64 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like
from cudf.core.column import arange
from cudf.core.frame import Frame
from cudf.core.index import Index
from cudf.core.index import Index, RangeIndex, _index_from_data
from cudf.core.multiindex import MultiIndex
from cudf.utils.utils import _gather_map_is_valid, cached_property

Expand Down Expand Up @@ -867,3 +867,66 @@ def resample(
if isinstance(self, cudf.Series)
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _reset_index(self, level, drop, col_level=0, col_fill=""):
    """Compute the data and index produced by resetting the index.

    Shared implementation behind ``DataFrame.reset_index`` and
    ``Series.reset_index``.

    Parameters
    ----------
    level : scalar, tuple, list or None
        Level(s) to remove from the index. A scalar is wrapped into a
        one-element tuple; ``None`` is forwarded unchanged.
    drop : bool
        When true, the removed levels are discarded instead of being
        returned as columns.
    col_level : int, default 0
        Only ``0`` is accepted.
    col_fill : str, default ""
        Only ``""`` is accepted.

    Returns
    -------
    tuple
        ``(data, index)``. ``data`` is ``self._data`` itself when ``drop``
        is true, otherwise a new dict with the removed levels placed ahead
        of the existing columns.

    Raises
    ------
    NotImplementedError
        If ``col_level`` or ``col_fill`` is not the default value.
    """
    if col_level != 0:
        raise NotImplementedError(
            "col_level parameter is not supported yet."
        )
    if col_fill != "":
        raise NotImplementedError(
            "col_fill parameter is not supported yet."
        )

    if not (level is None or isinstance(level, (tuple, list))):
        level = (level,)
    _check_duplicate_level_names(level, self._index.names)

    # Ask the index which of its columns leave and which remain.
    (
        moved_columns,
        kept_columns,
        moved_names,
        kept_names,
    ) = self._index._split_columns_by_levels(level)

    if not kept_columns:
        new_index = RangeIndex(len(self))
    else:
        # The remaining columns are keyed by position here; the real
        # level names are assigned right after construction.
        new_index = _index_from_data(
            dict(enumerate(kept_columns)),
            name=self._index.name,
        )
        if isinstance(new_index, MultiIndex):
            new_index.names = kept_names
        else:
            new_index.name = kept_names[0]

    if drop:
        return self._data, new_index

    # An unnamed level is labelled "index"; fall back to "level_0" when a
    # column with that label already exists.
    # NOTE(review): any other label clash is resolved by the dict merge
    # below in favour of the existing column - confirm this is intended.
    inserted = {
        (
            "level_0"
            if label == "index" and "index" in self._data
            else label
        ): column
        for label, column in zip(moved_names, moved_columns)
    }
    return {**inserted, **self._data}, new_index
isVoid marked this conversation as resolved.
Show resolved Hide resolved


def _check_duplicate_level_names(specified, level_names):
isVoid marked this conversation as resolved.
Show resolved Hide resolved
if specified is None:
return
isVoid marked this conversation as resolved.
Show resolved Hide resolved
non_duplicates = set()
duplicates = set()

for x in level_names:
if x in non_duplicates:
duplicates.add(x)
else:
non_duplicates.add(x)
isVoid marked this conversation as resolved.
Show resolved Hide resolved

if any(x in duplicates for x in specified):
raise ValueError(
f"The name {x} occurs multiple times, use a level number"
isVoid marked this conversation as resolved.
Show resolved Hide resolved
)
30 changes: 30 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1733,3 +1733,33 @@ def _intersection(self, other, sort=None):
if sort is None and len(other):
return midx.sort_values()
return midx

def _split_columns_by_levels(self, levels):
# This function assumes that for levels with duplicate names, they are
# specified by indices, not name by ``levels``. E.g. [None, None] can
# only be specified by 0, 1, not "None".

# Normalize named levels into indices
if levels is not None:
level_indices = set()
level_names = list(self.names)
for lv in levels:
if isinstance(lv, int):
level_indices.add(lv)
else:
level_indices.add(level_names.index(lv))
isVoid marked this conversation as resolved.
Show resolved Hide resolved
else:
level_indices = range(len(self._data))
isVoid marked this conversation as resolved.
Show resolved Hide resolved

# Split the columns
data_columns, index_columns = [], []
column_names, index_names = [], []
for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
if i in level_indices:
name = f"level_{i}" if name is None else name
data_columns.append(col)
column_names.append(name)
else:
index_columns.append(col)
index_names.append(name)
return data_columns, index_columns, column_names, index_names
isVoid marked this conversation as resolved.
Show resolved Hide resolved
64 changes: 14 additions & 50 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,62 +831,26 @@ def reindex(self, index=None, copy=True):
series.name = self.name
return series

def reset_index(self, drop=False, inplace=False):
"""
Reset index to RangeIndex

Parameters
----------
drop : bool, default False
Just reset the index, without inserting it as a column in
the new DataFrame.
inplace : bool, default False
Modify the Series in place (do not create a new object).

Returns
-------
Series or DataFrame or None
When `drop` is False (the default), a DataFrame is returned.
The newly created columns will come first in the DataFrame,
followed by the original Series values.
When `drop` is True, a `Series` is returned.
In either case, if ``inplace=True``, no value is returned.

Examples
--------
>>> import cudf
>>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13])
>>> series
10 a
11 b
12 c
13 d
dtype: object
>>> series.reset_index()
index 0
0 10 a
1 11 b
2 12 c
3 13 d
>>> series.reset_index(drop=True)
0 a
1 b
2 c
3 d
dtype: object
"""
@docutils.doc_series_reset_index()
def reset_index(self, level=None, drop=False, name=None, inplace=False):
"""{docstring}"""
data, index = self._reset_index(level=level, drop=drop)
if not drop:
if inplace is True:
raise TypeError(
"Cannot reset_index inplace on a Series "
"to create a DataFrame"
)
return self.to_frame().reset_index(drop=drop)
else:
if inplace is True:
self._index = RangeIndex(len(self))
else:
return self._from_data(self._data, index=RangeIndex(len(self)))
if name is None:
name = 0 if self.name is None else self.name
data[name] = data.pop(self.name)
return cudf.core.dataframe.DataFrame._from_data(data, index)
# For ``name`` behavior, see:
# https://github.com/pandas-dev/pandas/issues/44575
isVoid marked this conversation as resolved.
Show resolved Hide resolved
return self._mimic_inplace(
Series._from_data(data, index, name if inplace else None),
inplace=inplace,
)

def set_index(self, index):
"""Returns a new Series with a different index.
Expand Down
Loading