diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ed1cc74db71..aa89b8f849f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1414,6 +1414,16 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _split_columns_by_levels(self, levels): + if isinstance(levels, int) and levels > 0: + raise ValueError(f"Out of bound level: {levels}") + return ( + [self._data[self.name]], + [], + ["index" if self.name is None else self.name], + [], + ) + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 197011e629d..fe6ac8e1529 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,6 +58,7 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler @@ -2429,29 +2430,13 @@ def set_index( df.index = idx return df if not inplace else None - def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" - ): - """ - Reset the index. - - Reset the index of the DataFrame, and use the default one instead. - - Parameters - ---------- - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - - Returns - ------- - DataFrame or None - DataFrame with the new index or None if ``inplace=True``. - - Examples - -------- + @docutils.doc_apply( + doc_reset_index_template.format( + klass="DataFrame", + argument="", + return_type="DataFrame or None", + return_doc="", + example=""" >>> df = cudf.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... 
('mammal', 80.5), @@ -2476,45 +2461,51 @@ class max_speed 1 bird 24.0 2 mammal 80.5 3 mammal - """ - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - if col_level != 0: - raise NotImplementedError( - "col_level parameter is not supported yet." - ) - if col_fill != "": - raise NotImplementedError( - "col_fill parameter is not supported yet." - ) - - result = self if inplace else self.copy() - - if not drop: - if isinstance(self.index, MultiIndex): - names = tuple( - name if name is not None else f"level_{i}" - for i, name in enumerate(self.index.names) + You can also use ``reset_index`` with MultiIndex. + + >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> df = cudf.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=('speed', 'type')) + >>> df + speed type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey jump + >>> df.reset_index(level='class') + class speed type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal jump + """, + ) + ) + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): + return self._mimic_inplace( + DataFrame._from_data( + *self._reset_index( + level=level, + drop=drop, + col_level=col_level, + col_fill=col_fill, ) - else: - if self.index.name is None: - if "index" in self._data.names: - names = ("level_0",) - else: - names = ("index",) - else: - names = (self.index.name,) - - index_columns = self.index._data.columns - for name, index_column in zip( - reversed(names), reversed(index_columns) - ): - result.insert(0, name, index_column) - result.index = RangeIndex(len(self)) - if not inplace: - return result + ), + inplace=inplace, + ) def take(self, indices, axis=0, 
keep_index=None): axis = self._get_axis_from_axis_arg(axis) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 539408b6afb..0345966d6bd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -150,8 +150,8 @@ def _from_columns( n_index_columns = 0 if index_names is not None: n_index_columns = len(index_names) - index = cudf.core.index._index_from_data( - dict(zip(range(n_index_columns), columns)) + index = cudf.core.index._index_from_columns( + columns[:n_index_columns] ) if isinstance(index, cudf.MultiIndex): index.names = index_names diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 362c96ebbeb..859a81bc5f4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -114,6 +114,13 @@ def _index_from_data(data: MutableMapping, name: Any = None): return index_class_type._from_data(data, None, name) +def _index_from_columns( + columns: List[cudf.core.column.ColumnBase], name: Any = None +): + """Construct an index from ``columns``, with levels named 0, 1, 2...""" + return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) + + class RangeIndex(BaseIndex): """ Immutable Index implementing a monotonic integer range. 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ecacb1ff326..2f4d4a88195 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5,7 +5,7 @@ import operator import warnings -from collections import abc +from collections import Counter, abc from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -24,11 +24,37 @@ is_list_like, ) from cudf.core.column import arange +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame -from cudf.core.index import Index +from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.utils.utils import _gather_map_is_valid, cached_property +doc_reset_index_template = """ + Reset the index of the {klass}, or a level of it. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. +{argument} + inplace : bool, default False + Modify the {klass} in place (do not create a new object). 
+ + Returns + ------- + {return_type} + {klass} with the new index or None if ``inplace=True``.{return_doc} + + Examples + -------- + {example} +""" + def _indices_from_labels(obj, labels): from cudf.core.column import column @@ -1171,6 +1197,53 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _reset_index(self, level, drop, col_level=0, col_fill=""): + """Shared path for DataFrame.reset_index and Series.reset_index.""" + if level is not None and not isinstance(level, (tuple, list)): + level = (level,) + _check_duplicate_level_names(level, self._index.names) + + # Split the columns in the index into data and index columns + ( + data_columns, + index_columns, + data_names, + index_names, + ) = self._index._split_columns_by_levels(level) + if index_columns: + index = _index_from_columns(index_columns, name=self._index.name,) + if isinstance(index, MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + else: + index = RangeIndex(len(self)) + + if drop: + return self._data, index + + new_column_data = {} + for name, col in zip(data_names, data_columns): + if name == "index" and "index" in self._data: + name = "level_0" + name = ( + tuple( + name if i == col_level else col_fill + for i in range(self._data.nlevels) + ) + if self._data.multiindex + else name + ) + new_column_data[name] = col + # This is to match pandas where the new data columns are always + # inserted to the left of existing data columns. 
+ return ( + ColumnAccessor( + {**new_column_data, **self._data}, self._data.multiindex + ), + index, + ) + def _first_or_last( self, offset, idx: int, op: Callable, side: str, slice_func: Callable ) -> "IndexedFrame": @@ -1292,3 +1365,20 @@ def last(self, offset): side="right", slice_func=lambda i: self.iloc[i:], ) + + +def _check_duplicate_level_names(specified, level_names): + """Raise if any of `specified` has duplicates in `level_names`.""" + if specified is None: + return + if len(set(level_names)) == len(level_names): + return + duplicates = {key for key, val in Counter(level_names).items() if val > 1} + + duplicates_specified = [spec for spec in specified if spec in duplicates] + if not len(duplicates_specified) == 0: + # Note: pandas raises first encountered duplicates, cuDF raises all. + raise ValueError( + f"The names {duplicates_specified} occurs multiple times, use a" + " level number" + ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c403c697e3d..b333c862f21 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1743,3 +1743,39 @@ def _intersection(self, other, sort=None): if sort is None and len(other): return midx.sort_values() return midx + + def _split_columns_by_levels(self, levels): + # This function assumes that levels with duplicate names are specified + # in ``levels`` by index, not by name. E.g. [None, None] can + # only be specified by 0, 1, not "None". 
+ + if levels is None: + return ( + list(self._data.columns), + [], + [ + f"level_{i}" if name is None else name + for i, name in enumerate(self.names) + ], + [], + ) + + # Normalize named levels into indices + level_names = list(self.names) + level_indices = { + lv if isinstance(lv, int) else level_names.index(lv) + for lv in levels + } + + # Split the columns + data_columns, index_columns = [], [] + data_names, index_names = [], [] + for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + if i in level_indices: + name = f"level_{i}" if name is None else name + data_columns.append(col) + data_names.append(name) + else: + index_columns.append(col) + index_names.append(name) + return data_columns, index_columns, data_names, index_names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a0e359d1278..11166320760 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -64,6 +64,7 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils import cudautils, docutils @@ -830,30 +831,22 @@ def reindex(self, index=None, copy=True): series.name = self.name return series - def reset_index(self, drop=False, inplace=False): - """ - Reset index to RangeIndex - - Parameters - ---------- - drop : bool, default False - Just reset the index, without inserting it as a column in - the new DataFrame. - inplace : bool, default False - Modify the Series in place (do not create a new object). - - Returns - ------- - Series or DataFrame or None - When `drop` is False (the default), a DataFrame is returned. - The newly created columns will come first in the DataFrame, - followed by the original Series values. - When `drop` is True, a `Series` is returned. - In either case, if ``inplace=True``, no value is returned. 
- - Examples - -------- - >>> import cudf + @docutils.doc_apply( + doc_reset_index_template.format( + klass="Series", + argument=""" + name : object, optional + The name to use for the column containing the original Series + values. Uses self.name by default. This argument is ignored when + ``drop`` is True.""", + return_type="Series or DataFrame or None", + return_doc=""" For Series, When drop is False (the default), a DataFrame + is returned. The newly created columns will come first in the + DataFrame, followed by the original Series values. When `drop` is + True, a `Series` is returned. In either case, if ``inplace=True``, + no value is returned. +""", + example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series 10 a @@ -873,19 +866,51 @@ def reset_index(self, drop=False, inplace=False): 2 c 3 d dtype: object - """ + + You can also use ``reset_index`` with MultiIndex. + + >>> s2 = cudf.Series( + ... range(4), name='foo', + ... index=cudf.MultiIndex.from_tuples([ + ... ('bar', 'one'), ('bar', 'two'), + ... ('baz', 'one'), ('baz', 'two')], + ... names=['a', 'b'] + ... 
)) + >>> s2 + a b + bar one 0 + two 1 + baz one 2 + two 3 + Name: foo, dtype: int64 + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 +""", + ) + ) + def reset_index(self, level=None, drop=False, name=None, inplace=False): + if not drop and inplace: + raise TypeError( + "Cannot reset_index inplace on a Series " + "to create a DataFrame" + ) + data, index = self._reset_index(level=level, drop=drop) if not drop: - if inplace is True: - raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" - ) - return self.to_frame().reset_index(drop=drop) - else: - if inplace is True: - self._index = RangeIndex(len(self)) - else: - return self._from_data(self._data, index=RangeIndex(len(self))) + if name is None: + name = 0 if self.name is None else self.name + data[name] = data.pop(self.name) + return cudf.core.dataframe.DataFrame._from_data(data, index) + # For ``name`` behavior, see: + # https://github.com/pandas-dev/pandas/issues/44575 + return self._mimic_inplace( + Series._from_data(data, index, name if inplace else None), + inplace=inplace, + ) def set_index(self, index): """Returns a new Series with a different index. 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 73f9cb858e1..e5b298a8448 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2513,37 +2513,137 @@ def test_tail_for_string(): assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index(pdf, gdf, drop): - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, inplace=False), +@pytest.mark.parametrize( + "column_names", + [ + ["v0", "v1"], + ["v0", "index"], + pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), + pdf = pd.DataFrame( + [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): + # midx levels are named [None, None] 
+ midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) + gdf = cudf.from_pandas(pdf) + if level == [None]: + assert_exceptions_equal( + lfunc=pdf.reset_index, + rfunc=gdf.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + expected_error_message="occurs multiple times, use a level number", + ) + return + + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_named_index(pdf, gdf, drop): +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_named(pdf, gdf, drop, inplace, col_level, col_fill): pdf.index.name = "cudf" gdf.index.name = "cudf" - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, inplace=False), + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index_inplace(pdf, gdf, drop): - pdf.reset_index(drop=drop, inplace=True) - gdf.reset_index(drop=drop, inplace=True) - assert_eq(pdf, gdf) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("column_names", [["x", 
"y"], ["index", "y"]]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_unnamed( + pdf, gdf, drop, inplace, column_names, col_level, col_fill +): + pdf.columns = column_names + gdf.columns = column_names + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 583d2c7a8dd..ffdd53c58ac 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1345,6 +1345,122 @@ def test_nullable_bool_dtype_series(data, bool_dtype): assert_eq(psr, gsr.to_pandas(nullable=True)) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_reset_index(level, drop, inplace, original_name, name): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] + ) + ps = pd.Series(range(4), index=midx, name=original_name) + gs = cudf.from_pandas(ps) + + if not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) 
+@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx, name=original_name) + gs = cudf.from_pandas(ps) + if level == [None] or not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) + got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_named(drop, inplace, original_name, name): + ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) + gs = cudf.from_pandas(ps) + + ps.index.name = "cudf" + gs.index.name = "cudf" + + if not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" + ) + + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) + got = gs.reset_index(drop=drop, inplace=inplace, name=name) + + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +def test_reset_index_dup_level_name_exceptions(): + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx) + gs = cudf.from_pandas(ps) + + # Should specify duplicate level names with level number. 
+ assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"level": [None]},), + rfunc_args_and_kwargs=([], {"level": [None]},), + expected_error_message="occurs multiple times, use a level number", + ) + + # Cannot use drop=False and inplace=True to turn a series into dataframe. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + rfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + ) + + # Pandas raises the above exception when these two inputs are combined. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + rfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + ) + + def test_series_add_prefix(): cd_s = cudf.Series([1, 2, 3, 4]) pd_s = cd_s.to_pandas() diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 57ad612846d..7a4a2673f9b 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -68,6 +68,16 @@ def wrapper(func): return wrapper +def doc_apply(doc): + """Set `__doc__` attribute of `func` to `doc`.""" + + def wrapper(func): + func.__doc__ = doc + return func + + return wrapper + + doc_describe = docfmt_partial( docstring=""" Generate descriptive statistics.