From 6edc4becbf416622ca01393f657d7cffe2b01746 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 20 Nov 2021 19:02:26 -0800 Subject: [PATCH 01/28] initial pass --- python/cudf/cudf/core/_base_index.py | 5 +++ python/cudf/cudf/core/dataframe.py | 44 +++------------------- python/cudf/cudf/core/indexed_frame.py | 47 +++++++++++++++++++++++- python/cudf/cudf/core/multiindex.py | 19 ++++++++++ python/cudf/cudf/core/series.py | 15 ++++---- python/cudf/cudf/tests/test_dataframe.py | 31 +++++++++------- python/cudf/cudf/tests/test_series.py | 22 +++++++++++ 7 files changed, 123 insertions(+), 60 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..c57d889d2ca 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1414,6 +1414,11 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _split_columns_by_levels(self, levels): + if isinstance(levels, int) and levels > 0: + raise ValueError(f"Out of bound level: {levels}") + return {0: self._data[self.name]}, {}, [self.name], [] + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2e6588edb2..adc5205b4f2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2531,44 +2531,12 @@ class max_speed 2 mammal 80.5 3 mammal """ - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - if col_level != 0: - raise NotImplementedError( - "col_level parameter is not supported yet." - ) - - if col_fill != "": - raise NotImplementedError( - "col_fill parameter is not supported yet." - ) - - result = self if inplace else self.copy() - - if not drop: - if isinstance(self.index, cudf.MultiIndex): - names = tuple( - name if name is not None else f"level_{i}" - for i, name in enumerate(self.index.names) - ) - else: - if self.index.name is None: - if "index" in self._data.names: - names = ("level_0",) - else: - names = ("index",) - else: - names = (self.index.name,) - - index_columns = self.index._data.columns - for name, index_column in zip( - reversed(names), reversed(index_columns) - ): - result.insert(0, name, index_column) - result.index = RangeIndex(len(self)) - if not inplace: - return result + data, index = self._reset_index( + level=level, drop=drop, col_level=col_level, col_fill=col_fill + ) + return self._mimic_inplace( + DataFrame._from_data(data, index), inplace=inplace + ) def take(self, indices, axis=0, keep_index=None): axis = self._get_axis_from_axis_arg(axis) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cf12907d96a..145eee35484 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -16,7 +16,7 @@ from cudf.api.types import is_categorical_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame -from cudf.core.index import Index +from cudf.core.index import Index, RangeIndex, _index_from_data from cudf.core.multiindex import MultiIndex from cudf.utils.utils import cached_property @@ -758,3 +758,48 @@ def resample( if isinstance(self, cudf.Series) else cudf.core.resample.DataFrameResampler(self, by=by) ) + + def _reset_index(self, level, drop, col_level=0, col_fill=""): + """Shared path for DataFrame.reset_index and Series.reset_index.""" + if col_level != 0: + raise NotImplementedError( + "col_level parameter is not supported yet." + ) + + if col_fill != "": + raise NotImplementedError( + "col_fill parameter is not supported yet." + ) + + if not isinstance(level, (tuple, list)): + level = (level,) + # Split the columns in the index into data and index columns + ( + data_columns, + index_columns, + column_names, + index_names, + ) = self._index._split_columns_by_levels(level) + if index_columns: + index = _index_from_data(index_columns, name=self._index.name) + if isinstance(index, MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + else: + index = RangeIndex(len(self)) + + if drop: + return self._data, index + + new_column_data = {} + for name, (i, col) in zip(column_names, data_columns.items()): + if name is None: + name = ( + f"level_{i}" + if "index" in self._data.names or i > 0 + else "index" + ) + new_column_data[name] = col + result_data = {**new_column_data, **self._data} + return result_data, index diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7c132e3fb71..86e411a2b0d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1727,3 +1727,22 @@ def _intersection(self, other, sort=None): if sort is None and len(other): return midx.sort_values() return midx + + def _split_columns_by_levels(self, levels): + level_indices = [] + level_names = list(self._data.columns) + for lv in levels: + if isinstance(lv, int): + level_indices.append(lv) + else: + level_indices.append(level_names.index(lv)) + s0, s1 = {}, {} + column_names, index_names = [], [] + for i, (name, col) in enumerate(self._data.items()): + if i in level_indices: + s1[i] = col + column_names.append(name) + else: + s0[i] = col + index_names.append(name) + return s1, s0, column_names, index_names diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 00a8ebabe34..4a0a9fee185 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -831,7 +831,7 @@ def reindex(self, index=None, copy=True): series.name = self.name return series - def reset_index(self, drop=False, inplace=False): + def reset_index(self, level=None, drop=False, name=None, inplace=False): """ Reset index to RangeIndex @@ -875,18 +875,19 @@ def reset_index(self, drop=False, inplace=False): 3 d dtype: object """ + data, index = self._reset_index(level=level, drop=drop) if not drop: if inplace is True: raise TypeError( "Cannot reset_index inplace on a Series " "to create a DataFrame" ) - return self.to_frame().reset_index(drop=drop) - else: - if inplace is True: - self._index = RangeIndex(len(self)) - else: - return self._from_data(self._data, index=RangeIndex(len(self))) + if None in data: + data[0] = data.pop(None) # Should be handled in _from_data? + return cudf.core.dataframe.DataFrame._from_data(data, index) + return self._mimic_inplace( + Series._from_data(data, index), inplace=inplace + ) def set_index(self, index): """Returns a new Series with a different index. diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 30edc0fb260..8047d957c21 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2574,16 +2574,26 @@ def test_tail_for_string(): assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) +@pytest.mark.parametrize("level", [0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index(pdf, gdf, drop): - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, inplace=False), +@pytest.mark.parametrize("column_names", [["v0", "v1"], ["v0", "index"]]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_reset_index(level, drop, column_names, inplace): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), + pdf = pd.DataFrame( + [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.reset_index(level=level, drop=drop, inplace=inplace) + got = gdf.reset_index(level=level, drop=drop, inplace=inplace) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) @pytest.mark.parametrize("drop", [True, False]) @@ -2600,13 +2610,6 @@ def test_reset_named_index(pdf, gdf, drop): ) -@pytest.mark.parametrize("drop", [True, False]) -def test_reset_index_inplace(pdf, gdf, drop): - pdf.reset_index(drop=drop, inplace=True) - gdf.reset_index(drop=drop, inplace=True) - assert_eq(pdf, gdf) - - @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 09f0417b7ac..d97b0d96bf5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1335,3 +1335,25 @@ def test_equals_names(lhs, rhs): expect = lhs.to_pandas().equals(rhs.to_pandas()) assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("name", [None, "ser"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_reset_index(level, drop, inplace, name): + if not drop and inplace: + pytest.skip() + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] + ) + ps = pd.Series(range(4), index=midx, name=name) + gs = cudf.from_pandas(ps) + + expect = ps.reset_index(level=level, drop=drop, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, inplace=inplace) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) From 8416b9438caabb66a7c6b0de221458c6433074d9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sun, 21 Nov 2021 12:40:40 -0800 Subject: [PATCH 02/28] handling level=None, check duplicate level names, move level name handling to index.py --- python/cudf/cudf/core/_base_index.py | 7 ++++- python/cudf/cudf/core/indexed_frame.py | 35 +++++++++++++++++++----- python/cudf/cudf/core/multiindex.py | 21 ++++++++------ python/cudf/cudf/tests/test_dataframe.py | 26 +++++++++++++++++- python/cudf/cudf/tests/test_series.py | 2 +- 5 files changed, 73 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c57d889d2ca..ecb6df0d369 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1417,7 +1417,12 @@ def _constructor_expanddim(self): def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") - return {0: self._data[self.name]}, {}, [self.name], [] + return ( + {0: self._data[self.name]}, + {}, + ["index" if self.name is None else self.name], + [], + ) def _get_result_name(left_name, right_name): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 145eee35484..8a3cf13c307 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -771,8 +771,10 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): "col_fill parameter is not supported yet." ) - if not isinstance(level, (tuple, list)): + if level is not None and not isinstance(level, (tuple, list)): level = (level,) + _check_duplicate_level_names(level, self._index.names) + # Split the columns in the index into data and index columns ( data_columns, @@ -794,12 +796,31 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): new_column_data = {} for name, (i, col) in zip(column_names, data_columns.items()): - if name is None: - name = ( - f"level_{i}" - if "index" in self._data.names or i > 0 - else "index" - ) + if name == "index" and "index" in self._data: + name = "level_0" new_column_data[name] = col result_data = {**new_column_data, **self._data} return result_data, index + + +def _check_duplicate_level_names(specified, level_names): + if specified is None: + return + # Size: specified M, level_names N + # Worst case: Nlog(N) + Mlog(N) + existing = set() + duplicates = set() + + # Worst case: Nlog(N) + for x in level_names: + if x in existing: + duplicates.add(x) + else: + existing.add(x) + + # Worst case: Mlog(N) + for x in specified: + if x in duplicates: + raise ValueError( + f"The name {x} occurs multiple times, use a level number" + ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 86e411a2b0d..b7fbec326aa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1729,17 +1729,22 @@ def _intersection(self, other, sort=None): return midx def _split_columns_by_levels(self, levels): - level_indices = [] - level_names = list(self._data.columns) - for lv in levels: - if isinstance(lv, int): - level_indices.append(lv) - else: - level_indices.append(level_names.index(lv)) + if levels is not None: + level_indices = [] + level_names = list(self.names) + for lv in levels: + if isinstance(lv, int): + level_indices.append(lv) + else: + level_indices.append(level_names.index(lv)) + else: + level_indices = range(len(self._data)) + s0, s1 = {}, {} column_names, index_names = [], [] - for i, (name, col) in enumerate(self._data.items()): + for i, (name, col) in enumerate(zip(self.names, self._data.columns)): if i in level_indices: + name = f"level_{i}" if name is None else name s1[i] = col column_names.append(name) else: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8047d957c21..d76c76be001 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2574,7 +2574,7 @@ def test_tail_for_string(): assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) -@pytest.mark.parametrize("level", [0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("column_names", [["v0", "v1"], ["v0", "index"]]) @pytest.mark.parametrize("inplace", [True, False]) @@ -2596,6 +2596,30 @@ def test_reset_index(level, drop, column_names, inplace): assert_eq(expect, got) +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) +def test_reset_index_dup_level_name(level, drop, inplace): + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) + gdf = cudf.from_pandas(pdf) + if level == [None]: + with pytest.raises( + ValueError, match="occurs multiple times, use a level number$" + ): + gdf.reset_index(level=level, drop=drop, inplace=inplace) + return + + expect = pdf.reset_index(level=level, drop=drop, inplace=inplace) + got = gdf.reset_index(level=level, drop=drop, inplace=inplace) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) + + @pytest.mark.parametrize("drop", [True, False]) def test_reset_named_index(pdf, gdf, drop): pdf.index.name = "cudf" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d97b0d96bf5..8865a93344e 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1337,7 +1337,7 @@ def test_equals_names(lhs, rhs): assert_eq(expect, got) -@pytest.mark.parametrize("level", [0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("name", [None, "ser"]) @pytest.mark.parametrize("inplace", [True, False]) From 41986ff24f50f45d0c3cdc68790b42c4f68d4b00 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 22 Nov 2021 10:07:32 -0800 Subject: [PATCH 03/28] first pass clean up, not returning column indices from helpers --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/core/indexed_frame.py | 7 +++++-- python/cudf/cudf/core/multiindex.py | 20 +++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 2 +- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ecb6df0d369..b7eb4a1fe90 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1418,8 +1418,8 @@ def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") return ( - {0: self._data[self.name]}, - {}, + [self._data[self.name]], + [], ["index" if self.name is None else self.name], [], ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8a3cf13c307..01441713d9d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -783,7 +783,10 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): index_names, ) = self._index._split_columns_by_levels(level) if index_columns: - index = _index_from_data(index_columns, name=self._index.name) + index = _index_from_data( + dict(zip(range(len(index_columns)), index_columns)), + name=self._index.name, + ) if isinstance(index, MultiIndex): index.names = index_names else: @@ -795,7 +798,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): return self._data, index new_column_data = {} - for name, (i, col) in zip(column_names, data_columns.items()): + for name, col in zip(column_names, data_columns): if name == "index" and "index" in self._data: name = "level_0" new_column_data[name] = col diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b7fbec326aa..ab6608596d4 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1729,25 +1729,31 @@ def _intersection(self, other, sort=None): return midx def _split_columns_by_levels(self, levels): + # This function assumes that for levels with duplicate names, they are + # specified by indices, not name by ``levels``. E.g. [None, None] can + # only be specified by 0, 1, not "None". + + # Normalize named levels into indices if levels is not None: - level_indices = [] + level_indices = set() level_names = list(self.names) for lv in levels: if isinstance(lv, int): - level_indices.append(lv) + level_indices.add(lv) else: - level_indices.append(level_names.index(lv)) + level_indices.add(level_names.index(lv)) else: level_indices = range(len(self._data)) - s0, s1 = {}, {} + # Split the columns + data_columns, index_columns = [], [] column_names, index_names = [], [] for i, (name, col) in enumerate(zip(self.names, self._data.columns)): if i in level_indices: name = f"level_{i}" if name is None else name - s1[i] = col + data_columns.append(col) column_names.append(name) else: - s0[i] = col + index_columns.append(col) index_names.append(name) - return s1, s0, column_names, index_names + return data_columns, index_columns, column_names, index_names diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d76c76be001..b5aacf26a44 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2621,7 +2621,7 @@ def test_reset_index_dup_level_name(level, drop, inplace): @pytest.mark.parametrize("drop", [True, False]) -def test_reset_named_index(pdf, gdf, drop): +def test_reset_index_named(pdf, gdf, drop): pdf.index.name = "cudf" gdf.index.name = "cudf" assert_eq( From 399353043ff0145e154f88f9a0b1bb9ac84792a7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 22 Nov 2021 10:42:25 -0800 Subject: [PATCH 04/28] lining up `name` behavior in series api --- python/cudf/cudf/core/series.py | 10 +++++++--- python/cudf/cudf/tests/test_series.py | 9 +++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4a0a9fee185..c7e8bd588bc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -882,11 +882,15 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): "Cannot reset_index inplace on a Series " "to create a DataFrame" ) - if None in data: - data[0] = data.pop(None) # Should be handled in _from_data? + if name is None: + name = 0 if self.name is None else self.name + data[name] = data.pop(self.name) return cudf.core.dataframe.DataFrame._from_data(data, index) + # For ``name`` behavior, see: + # https://github.com/pandas-dev/pandas/issues/44575 return self._mimic_inplace( - Series._from_data(data, index), inplace=inplace + Series._from_data(data, index, name if inplace else None), + inplace=inplace, ) def set_index(self, index): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8865a93344e..4c497a4b47d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1339,19 +1339,20 @@ def test_equals_names(lhs, rhs): @pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) @pytest.mark.parametrize("name", [None, "ser"]) @pytest.mark.parametrize("inplace", [True, False]) -def test_reset_index(level, drop, inplace, name): +def test_reset_index(level, drop, inplace, original_name, name): if not drop and inplace: pytest.skip() midx = pd.MultiIndex.from_tuples( [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) - ps = pd.Series(range(4), index=midx, name=name) + ps = pd.Series(range(4), index=midx, name=original_name) gs = cudf.from_pandas(ps) - expect = ps.reset_index(level=level, drop=drop, inplace=inplace) - got = gs.reset_index(level=level, drop=drop, inplace=inplace) + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) if inplace: expect = ps got = gs From f90bdd4b5871c95d1a2f10b363189ad2f6ebd573 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 22 Nov 2021 10:53:23 -0800 Subject: [PATCH 05/28] duplicate same tests to series --- python/cudf/cudf/tests/test_dataframe.py | 18 ++++----- python/cudf/cudf/tests/test_series.py | 51 ++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b5aacf26a44..e38864015f8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2621,17 +2621,17 @@ def test_reset_index_dup_level_name(level, drop, inplace): @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index_named(pdf, gdf, drop): +@pytest.mark.parametrize("inplace", [False, True]) +def test_reset_index_named(pdf, gdf, drop, inplace): pdf.index.name = "cudf" gdf.index.name = "cudf" - assert_eq( - pdf.reset_index(drop=drop, inplace=False), - gdf.reset_index(drop=drop, inplace=False), - ) - assert_eq( - pdf.x.reset_index(drop=drop, inplace=False), - gdf.x.reset_index(drop=drop, inplace=False), - ) + + expect = pdf.reset_index(drop=drop, inplace=inplace) + got = gdf.reset_index(drop=drop, inplace=inplace) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 4c497a4b47d..1d0f8e602e9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1358,3 +1358,54 @@ def test_reset_index(level, drop, inplace, original_name, name): got = gs assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("drop", [False, True]) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): + if not drop and inplace: + pytest.skip() + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx, name=original_name) + gs = cudf.from_pandas(ps) + if level == [None]: + with pytest.raises( + ValueError, match="occurs multiple times, use a level number$" + ): + gs.reset_index(level=level, drop=drop, inplace=inplace) + return + + expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) + got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) + + +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("original_name", [None, "original_ser"]) +@pytest.mark.parametrize("name", [None, "ser"]) +def test_reset_index_named(drop, inplace, original_name, name): + if not drop and inplace: + pytest.skip() + ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) + gs = cudf.from_pandas(ps) + + ps.index.name = "cudf" + gs.index.name = "cudf" + + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) + got = gs.reset_index(drop=drop, inplace=inplace, name=name) + + if inplace: + expect = ps + got = gs + + assert_eq(expect, got) From f612e65658bdaf527f1e0a75c6a41705de477264 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 29 Nov 2021 11:49:49 -0800 Subject: [PATCH 06/28] minor improvements and extra test cases --- python/cudf/cudf/core/indexed_frame.py | 19 +++++++------------ python/cudf/cudf/tests/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 01441713d9d..a244d0e6266 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -809,21 +809,16 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): def _check_duplicate_level_names(specified, level_names): if specified is None: return - # Size: specified M, level_names N - # Worst case: Nlog(N) + Mlog(N) - existing = set() + non_duplicates = set() duplicates = set() - # Worst case: Nlog(N) for x in level_names: - if x in existing: + if x in non_duplicates: duplicates.add(x) else: - existing.add(x) + non_duplicates.add(x) - # Worst case: Mlog(N) - for x in specified: - if x in duplicates: - raise ValueError( - f"The name {x} occurs multiple times, use a level number" - ) + if any(x in duplicates for x in specified): + raise ValueError( + f"The name {x} occurs multiple times, use a level number" + ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e38864015f8..e30d0b6b376 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2634,6 +2634,21 @@ def test_reset_index_named(pdf, gdf, drop, inplace): assert_eq(expect, got) +@pytest.mark.parametrize("drop", [True, False]) +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) +def test_reset_index_unnamed(pdf, gdf, drop, inplace, column_names): + pdf.columns = column_names + gdf.columns = column_names + + expect = pdf.reset_index(drop=drop, inplace=inplace) + got = gdf.reset_index(drop=drop, inplace=inplace) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) + + @pytest.mark.parametrize( "data", [ From 7ed329bc2e6959dfb26d522a92f24bce5c5a4418 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 29 Nov 2021 15:34:49 -0800 Subject: [PATCH 07/28] Make use of docfmt and docutils --- python/cudf/cudf/core/dataframe.py | 60 ++-------- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/series.py | 45 +------- python/cudf/cudf/utils/docutils.py | 150 +++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 61a8f3185f0..adef7bcf9b1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2486,59 +2486,21 @@ def set_index( df.index = idx return df if not inplace else None + @docutils.doc_dataframe_reset_index() def reset_index( self, level=None, drop=False, inplace=False, col_level=0, col_fill="" ): - """ - Reset the index. - - Reset the index of the DataFrame, and use the default one instead. - - Parameters - ---------- - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - - Returns - ------- - DataFrame or None - DataFrame with the new index or None if ``inplace=True``. - - Examples - -------- - >>> df = cudf.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) - >>> df - class max_speed - falcon bird 389.0 - parrot bird 24.0 - lion mammal 80.5 - monkey mammal - >>> df.reset_index() - index class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal - >>> df.reset_index(drop=True) - class max_speed - 0 bird 389.0 - 1 bird 24.0 - 2 mammal 80.5 - 3 mammal - """ - data, index = self._reset_index( - level=level, drop=drop, col_level=col_level, col_fill=col_fill - ) + """{docstring}""" return self._mimic_inplace( - DataFrame._from_data(data, index), inplace=inplace + DataFrame._from_data( + *self._reset_index( + level=level, + drop=drop, + col_level=col_level, + col_fill=col_fill, + ) + ), + inplace=inplace, ) def take(self, indices, axis=0, keep_index=None): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6ab3e817129..7e17ee066a4 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -911,8 +911,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): if name == "index" and "index" in self._data: name = "level_0" new_column_data[name] = col - result_data = {**new_column_data, **self._data} - return result_data, index + return {**new_column_data, **self._data}, index def _check_duplicate_level_names(specified, level_names): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 86f24502f96..f4d2c740281 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -831,50 +831,9 @@ def reindex(self, index=None, copy=True): series.name = self.name return series + @docutils.doc_series_reset_index() def reset_index(self, level=None, drop=False, name=None, inplace=False): - """ - Reset index to RangeIndex - - Parameters - ---------- - drop : bool, default False - Just reset the index, without inserting it as a column in - the new DataFrame. - inplace : bool, default False - Modify the Series in place (do not create a new object). - - Returns - ------- - Series or DataFrame or None - When `drop` is False (the default), a DataFrame is returned. - The newly created columns will come first in the DataFrame, - followed by the original Series values. - When `drop` is True, a `Series` is returned. - In either case, if ``inplace=True``, no value is returned. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) - >>> series - 10 a - 11 b - 12 c - 13 d - dtype: object - >>> series.reset_index() - index 0 - 0 10 a - 1 11 b - 2 12 c - 3 13 d - >>> series.reset_index(drop=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - """ + """{docstring}""" data, index = self._reset_index(level=level, drop=drop) if not drop: if inplace is True: diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 57ad612846d..f2d41c6a9f8 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -335,3 +335,153 @@ def wrapper(func): max 3.0 """ ) + +doc_reset_index_template = """ + Reset the index of the {klass}, or a level of it. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + {argument} + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + + Returns + ------- + {return_type} + {klass} with the new index or None if ``inplace=True``.{return_doc} + + Examples + -------- + {example} +""" + +doc_dataframe_reset_index = docfmt_partial( + docstring=doc_reset_index_template.format( + klass="DataFrame", + argument="", + return_type="DataFrame or None", + return_doc="", + example=""" + >>> df = cudf.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal + >>> df.reset_index() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal + + You can also use ``reset_index`` with MultiIndex. + + >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> df = cudf.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=('speed', 'type')) + >>> df + speed type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey jump + >>> df.reset_index(level='class') + class speed type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal jump + """, + ) +) + +doc_series_reset_index = docfmt_partial( + docstring=doc_reset_index_template.format( + klass="Series", + argument="""name : object, optional + The name to use for the column containing the original Series + values. Uses self.name by default. This argument is ignored when + ``drop`` is True.""", + return_type="Series or DataFrame or None", + return_doc=""" For Series, When drop is False (the default), a DataFrame + is returned. The newly created columns will come first in the + DataFrame, followed by the original Series values. When `drop` is + True, a `Series` is returned. In either case, if ``inplace=True``, + no value is returned. + """, + example=""" + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) + >>> series + 10 a + 11 b + 12 c + 13 d + dtype: object + >>> series.reset_index() + index 0 + 0 10 a + 1 11 b + 2 12 c + 3 13 d + >>> series.reset_index(drop=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + You can also use ``reset_index`` with MultiIndex. + + >>> s2 = cudf.Series( + ... range(4), name='foo', + ... index=cudf.MultiIndex.from_tuples([ + ... ('bar', 'one'), ('bar', 'two'), + ... ('baz', 'one'), ('baz', 'two')], + ... names=['a', 'b'] + ... )) + >>> s2 + a b + bar one 0 + two 1 + baz one 2 + two 3 + Name: foo, dtype: int64 + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 + """, + ) +) From dfce947f2aa193193504c7dd4b5272235296f3d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 29 Nov 2021 16:06:51 -0800 Subject: [PATCH 08/28] Minor doc improvement --- python/cudf/cudf/utils/docutils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index f2d41c6a9f8..c9b7bd0b708 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -375,7 +375,7 @@ def wrapper(func): ... index=['falcon', 'parrot', 'lion', 'monkey'], ... columns=('class', 'max_speed')) >>> df - class max_speed + class max_speed falcon bird 389.0 parrot bird 24.0 lion mammal 80.5 @@ -407,7 +407,7 @@ class max_speed ... index=index, ... columns=('speed', 'type')) >>> df - speed type + speed type class name bird falcon 389.0 fly parrot 24.0 fly @@ -467,7 +467,7 @@ class speed type ... ('bar', 'one'), ('bar', 'two'), ... ('baz', 'one'), ('baz', 'two')], ... names=['a', 'b'] - ... )) + ... )) >>> s2 a b bar one 0 From 6210abb85447536d16ef0db7979b896ab2eab168 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 13:09:56 -0800 Subject: [PATCH 09/28] use doc_apply --- python/cudf/cudf/core/dataframe.py | 62 ++++++++++++- python/cudf/cudf/core/series.py | 64 +++++++++++++- python/cudf/cudf/utils/docutils.py | 134 ++--------------------------- 3 files changed, 130 insertions(+), 130 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index adef7bcf9b1..c9eb88c7e86 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2486,11 +2486,69 @@ def set_index( df.index = idx return df if not inplace else None - @docutils.doc_dataframe_reset_index() + @docutils.doc_apply(docutils.doc_reset_index_template.format( + klass="DataFrame", + argument="", + return_type="DataFrame or None", + return_doc="", + example=""" + >>> df = cudf.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal + >>> df.reset_index() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal + + You can also use ``reset_index`` with MultiIndex. + + >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> df = cudf.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... columns=('speed', 'type')) + >>> df + speed type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey jump + >>> df.reset_index(level='class') + class speed type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal jump + """ + )) def reset_index( self, level=None, drop=False, inplace=False, col_level=0, col_fill="" ): - """{docstring}""" return self._mimic_inplace( DataFrame._from_data( *self._reset_index( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f4d2c740281..2703591fcc9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -831,9 +831,69 @@ def reindex(self, index=None, copy=True): series.name = self.name return series - @docutils.doc_series_reset_index() + @docutils.doc_apply( + docutils.doc_reset_index_template.format( + klass="Series", + argument=""" + name : object, optional + The name to use for the column containing the original Series + values. Uses self.name by default. This argument is ignored when + ``drop`` is True.""", + return_type="Series or DataFrame or None", + return_doc=""" For Series, When drop is False (the default), a DataFrame + is returned. The newly created columns will come first in the + DataFrame, followed by the original Series values. When `drop` is + True, a `Series` is returned. In either case, if ``inplace=True``, + no value is returned. +""", + example=""" + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) + >>> series + 10 a + 11 b + 12 c + 13 d + dtype: object + >>> series.reset_index() + index 0 + 0 10 a + 1 11 b + 2 12 c + 3 13 d + >>> series.reset_index(drop=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + You can also use ``reset_index`` with MultiIndex. + + >>> s2 = cudf.Series( + ... range(4), name='foo', + ... index=cudf.MultiIndex.from_tuples([ + ... ('bar', 'one'), ('bar', 'two'), + ... ('baz', 'one'), ('baz', 'two')], + ... names=['a', 'b'] + ... )) + >>> s2 + a b + bar one 0 + two 1 + baz one 2 + two 3 + Name: foo, dtype: int64 + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 +""", + ) + ) def reset_index(self, level=None, drop=False, name=None, inplace=False): - """{docstring}""" data, index = self._reset_index(level=level, drop=drop) if not drop: if inplace is True: diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index c9b7bd0b708..d102531917d 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -67,6 +67,13 @@ def wrapper(func): return wrapper +def doc_apply(doc): + """Set `__doc__` attribute of `func` to `doc`.""" + def wrapper(func): + func.__doc__ = doc + return func + return wrapper + doc_describe = docfmt_partial( docstring=""" @@ -347,7 +354,7 @@ def wrapper(func): drop : bool, default False Do not try to insert index into dataframe columns. This resets the index to the default integer index. - {argument} +{argument} inplace : bool, default False Modify the DataFrame in place (do not create a new object). @@ -360,128 +367,3 @@ def wrapper(func): -------- {example} """ - -doc_dataframe_reset_index = docfmt_partial( - docstring=doc_reset_index_template.format( - klass="DataFrame", - argument="", - return_type="DataFrame or None", - return_doc="", - example=""" - >>> df = cudf.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) - >>> df - class max_speed - falcon bird 389.0 - parrot bird 24.0 - lion mammal 80.5 - monkey mammal - >>> df.reset_index() - index class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal - >>> df.reset_index(drop=True) - class max_speed - 0 bird 389.0 - 1 bird 24.0 - 2 mammal 80.5 - 3 mammal - - You can also use ``reset_index`` with MultiIndex. - - >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), - ... ('bird', 'parrot'), - ... ('mammal', 'lion'), - ... ('mammal', 'monkey')], - ... names=['class', 'name']) - >>> df = cudf.DataFrame([(389.0, 'fly'), - ... ( 24.0, 'fly'), - ... ( 80.5, 'run'), - ... (np.nan, 'jump')], - ... index=index, - ... columns=('speed', 'type')) - >>> df - speed type - class name - bird falcon 389.0 fly - parrot 24.0 fly - mammal lion 80.5 run - monkey jump - >>> df.reset_index(level='class') - class speed type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal jump - """, - ) -) - -doc_series_reset_index = docfmt_partial( - docstring=doc_reset_index_template.format( - klass="Series", - argument="""name : object, optional - The name to use for the column containing the original Series - values. Uses self.name by default. This argument is ignored when - ``drop`` is True.""", - return_type="Series or DataFrame or None", - return_doc=""" For Series, When drop is False (the default), a DataFrame - is returned. The newly created columns will come first in the - DataFrame, followed by the original Series values. When `drop` is - True, a `Series` is returned. In either case, if ``inplace=True``, - no value is returned. - """, - example=""" - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) - >>> series - 10 a - 11 b - 12 c - 13 d - dtype: object - >>> series.reset_index() - index 0 - 0 10 a - 1 11 b - 2 12 c - 3 13 d - >>> series.reset_index(drop=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - - You can also use ``reset_index`` with MultiIndex. - - >>> s2 = cudf.Series( - ... range(4), name='foo', - ... index=cudf.MultiIndex.from_tuples([ - ... ('bar', 'one'), ('bar', 'two'), - ... ('baz', 'one'), ('baz', 'two')], - ... names=['a', 'b'] - ... )) - >>> s2 - a b - bar one 0 - two 1 - baz one 2 - two 3 - Name: foo, dtype: int64 - >>> s2.reset_index(level='a') - a foo - b - one bar 0 - two bar 1 - one baz 2 - two baz 3 - """, - ) -) From 8d8cced75c2bcf42634d7ea613c79b8d3b313b4e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 13:12:49 -0800 Subject: [PATCH 10/28] style --- python/cudf/cudf/core/dataframe.py | 18 ++++++++++-------- python/cudf/cudf/core/series.py | 10 +++++----- python/cudf/cudf/utils/docutils.py | 3 +++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c9eb88c7e86..17b0b0b6ea0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2486,12 +2486,13 @@ def set_index( df.index = idx return df if not inplace else None - @docutils.doc_apply(docutils.doc_reset_index_template.format( - klass="DataFrame", - argument="", - return_type="DataFrame or None", - return_doc="", - example=""" + @docutils.doc_apply( + docutils.doc_reset_index_template.format( + klass="DataFrame", + argument="", + return_type="DataFrame or None", + return_doc="", + example=""" >>> df = cudf.DataFrame([('bird', 389.0), ... ('bird', 24.0), ... ('mammal', 80.5), @@ -2544,8 +2545,9 @@ class speed type parrot bird 24.0 fly lion mammal 80.5 run monkey mammal jump - """ - )) + """, + ) + ) def reset_index( self, level=None, drop=False, inplace=False, col_level=0, col_fill="" ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2703591fcc9..0de539a015a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -833,20 +833,20 @@ def reindex(self, index=None, copy=True): @docutils.doc_apply( docutils.doc_reset_index_template.format( - klass="Series", - argument=""" + klass="Series", + argument=""" name : object, optional The name to use for the column containing the original Series values. Uses self.name by default. This argument is ignored when ``drop`` is True.""", - return_type="Series or DataFrame or None", - return_doc=""" For Series, When drop is False (the default), a DataFrame + return_type="Series or DataFrame or None", + return_doc=""" For Series, When drop is False (the default), a DataFrame is returned. The newly created columns will come first in the DataFrame, followed by the original Series values. When `drop` is True, a `Series` is returned. In either case, if ``inplace=True``, no value is returned. """, - example=""" + example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series 10 a diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index d102531917d..93e8690b991 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -67,11 +67,14 @@ def wrapper(func): return wrapper + def doc_apply(doc): """Set `__doc__` attribute of `func` to `doc`.""" + def wrapper(func): func.__doc__ = doc return func + return wrapper From ec496c05906d3375d7d1ca1bd4ea57e1934295aa Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 13:27:37 -0800 Subject: [PATCH 11/28] Commits review changes in check duplicates func Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/indexed_frame.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7e17ee066a4..d96805694a2 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -917,14 +917,9 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): def _check_duplicate_level_names(specified, level_names): if specified is None: return - non_duplicates = set() - duplicates = set() - - for x in level_names: - if x in non_duplicates: - duplicates.add(x) - else: - non_duplicates.add(x) + if len(set(level_names)) != len(level_names): + return + duplicates = {key for key, val in Counter(level_names).items() if val > 1} if any(x in duplicates for x in specified): raise ValueError( From c3bb37cf29118c5fe09eb82b92b3d1e6053413bf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 14:28:47 -0800 Subject: [PATCH 12/28] Raise proper error when duplicate name specified; improve assertion on test; and fix a bug from github commit --- python/cudf/cudf/core/indexed_frame.py | 12 +++++++----- python/cudf/cudf/tests/test_dataframe.py | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d96805694a2..a76b8bebb44 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4,6 +4,7 @@ from __future__ import annotations import warnings +from collections import Counter from typing import Type, TypeVar from uuid import uuid4 @@ -917,11 +918,12 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): def _check_duplicate_level_names(specified, level_names): if specified is None: return - if len(set(level_names)) != len(level_names): + if len(set(level_names)) == len(level_names): return duplicates = {key for key, val in Counter(level_names).items() if val > 1} - if any(x in duplicates for x in specified): - raise ValueError( - f"The name {x} occurs multiple times, use a level number" - ) + for x in specified: + if x in duplicates: + raise ValueError( + f"The name {x} occurs multiple times, use a level number" + ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8c10d044da5..ec765992816 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2610,10 +2610,18 @@ def test_reset_index_dup_level_name(level, drop, inplace): pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) gdf = cudf.from_pandas(pdf) if level == [None]: - with pytest.raises( - ValueError, match="occurs multiple times, use a level number$" - ): - gdf.reset_index(level=level, drop=drop, inplace=inplace) + assert_exceptions_equal( + lfunc=pdf.reset_index, + rfunc=gdf.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + ) return expect = pdf.reset_index(level=level, drop=drop, inplace=inplace) From dc2746dcf3d9eefab065e23be713053d25d9de50 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 14:31:33 -0800 Subject: [PATCH 13/28] Update python/cudf/cudf/core/multiindex.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/multiindex.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3dbc97dcda2..c8a1ad85972 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1741,13 +1741,8 @@ def _split_columns_by_levels(self, levels): # Normalize named levels into indices if levels is not None: - level_indices = set() level_names = list(self.names) - for lv in levels: - if isinstance(lv, int): - level_indices.add(lv) - else: - level_indices.add(level_names.index(lv)) + level_indices = {lv if isinstance(lv, int) else level_names.index(lv) for lv in levels} else: level_indices = range(len(self._data)) From 50980b773f6a90262f932ddbf11452d2fbeeadb5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 14:35:48 -0800 Subject: [PATCH 14/28] rename var --- python/cudf/cudf/core/multiindex.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c8a1ad85972..62d844841ac 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1742,19 +1742,22 @@ def _split_columns_by_levels(self, levels): # Normalize named levels into indices if levels is not None: level_names = list(self.names) - level_indices = {lv if isinstance(lv, int) else level_names.index(lv) for lv in levels} + level_indices = { + lv if isinstance(lv, int) else level_names.index(lv) + for lv in levels + } else: level_indices = range(len(self._data)) # Split the columns data_columns, index_columns = [], [] - column_names, index_names = [], [] + data_names, index_names = [], [] for i, (name, col) in enumerate(zip(self.names, self._data.columns)): if i in level_indices: name = f"level_{i}" if name is None else name data_columns.append(col) - column_names.append(name) + data_names.append(name) else: index_columns.append(col) index_names.append(name) - return data_columns, index_columns, column_names, index_names + return data_columns, index_columns, data_names, index_names From 5e2078e1dbcb061940a1e2746ea47cee7884709c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 14:43:54 -0800 Subject: [PATCH 15/28] Special handling prior to loop --- python/cudf/cudf/core/multiindex.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 62d844841ac..bded17f98b8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1739,15 +1739,23 @@ def _split_columns_by_levels(self, levels): # specified by indices, not name by ``levels``. E.g. [None, None] can # only be specified by 0, 1, not "None". + if levels is None: + return ( + list(self._data.columns), + [], + [ + f"level_{i}" if name is None else name + for i, name in enumerate(self.names) + ], + [], + ) + # Normalize named levels into indices - if levels is not None: - level_names = list(self.names) - level_indices = { - lv if isinstance(lv, int) else level_names.index(lv) - for lv in levels - } - else: - level_indices = range(len(self._data)) + level_names = list(self.names) + level_indices = { + lv if isinstance(lv, int) else level_names.index(lv) + for lv in levels + } # Split the columns data_columns, index_columns = [], [] From 2c70c485c5387aa6df41ac93f8eff4de79d54b93 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 15:06:12 -0800 Subject: [PATCH 16/28] improve series test --- python/cudf/cudf/core/index.py | 3 +++ python/cudf/cudf/tests/test_series.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8f905ee6d49..8374678bafc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -113,6 +113,9 @@ def _index_from_data(data: MutableMapping, name: Any = None): index_class_type = cudf.MultiIndex return index_class_type._from_data(data, None, name) +def _index_from_column_list(columns: List[ColumnBase]): + """Construct an index from a list of unnamed columns.""" + return _index_from_data(dict(zip(range(len(columns)), columns))) class RangeIndex(BaseIndex): """ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3d92cb667a8..3f31a616b89 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1384,10 +1384,18 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): ps = pd.Series(range(4), index=midx, name=original_name) gs = cudf.from_pandas(ps) if level == [None]: - with pytest.raises( - ValueError, match="occurs multiple times, use a level number$" - ): - gs.reset_index(level=level, drop=drop, inplace=inplace) + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + ) return expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) From 44583fb440f37915bc6322e24d3e7daca583af8c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 15:51:49 -0800 Subject: [PATCH 17/28] not skipping tests when drop=False and inplace=True --- python/cudf/cudf/core/series.py | 10 +++---- python/cudf/cudf/tests/test_series.py | 38 ++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0de539a015a..e7c1ea592c2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -894,13 +894,13 @@ def reindex(self, index=None, copy=True): ) ) def reset_index(self, level=None, drop=False, name=None, inplace=False): + if not drop and inplace: + raise TypeError( + "Cannot reset_index inplace on a Series " + "to create a DataFrame" + ) data, index = self._reset_index(level=level, drop=drop) if not drop: - if inplace is True: - raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" - ) if name is None: name = 0 if self.name is None else self.name data[name] = data.pop(self.name) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3f31a616b89..724040aee50 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1354,14 +1354,27 @@ def test_nullable_bool_dtype_series(data, bool_dtype): @pytest.mark.parametrize("name", [None, "ser"]) @pytest.mark.parametrize("inplace", [True, False]) def test_reset_index(level, drop, inplace, original_name, name): - if not drop and inplace: - pytest.skip() midx = pd.MultiIndex.from_tuples( [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) ps = pd.Series(range(4), index=midx, name=original_name) gs = cudf.from_pandas(ps) + if not drop and inplace: + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + ) + return + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) if inplace: @@ -1377,13 +1390,11 @@ def test_reset_index(level, drop, inplace, original_name, name): @pytest.mark.parametrize("original_name", [None, "original_ser"]) @pytest.mark.parametrize("name", [None, "ser"]) def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): - if not drop and inplace: - pytest.skip() # midx levels are named [None, None] midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) ps = pd.Series(range(4), index=midx, name=original_name) gs = cudf.from_pandas(ps) - if level == [None]: + if level == [None] or (not drop and inplace): assert_exceptions_equal( lfunc=ps.reset_index, rfunc=gs.reset_index, @@ -1412,14 +1423,27 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): @pytest.mark.parametrize("original_name", [None, "original_ser"]) @pytest.mark.parametrize("name", [None, "ser"]) def test_reset_index_named(drop, inplace, original_name, name): - if not drop and inplace: - pytest.skip() ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) gs = cudf.from_pandas(ps) ps.index.name = "cudf" gs.index.name = "cudf" + if not drop and inplace: + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": None, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": None, "drop": drop, "inplace": inplace}, + ), + ) + return + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) got = gs.reset_index(drop=drop, inplace=inplace, name=name) From 041385863b8370c6adf56d91478477c670243e46 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Sat, 4 Dec 2021 15:53:49 -0800 Subject: [PATCH 18/28] revert factory pending discussion --- python/cudf/cudf/core/index.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8374678bafc..8f905ee6d49 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -113,9 +113,6 @@ def _index_from_data(data: MutableMapping, name: Any = None): index_class_type = cudf.MultiIndex return index_class_type._from_data(data, None, name) -def _index_from_column_list(columns: List[ColumnBase]): - """Construct an index from a list of unnamed columns.""" - return _index_from_data(dict(zip(range(len(columns)), columns))) class RangeIndex(BaseIndex): """ From 5b81aa3b8d55f88c4d5c8252a07a5dd47f08ba8f Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 15:53:02 -0800 Subject: [PATCH 19/28] add _index_from_columns helper --- python/cudf/cudf/core/frame.py | 4 +--- python/cudf/cudf/core/index.py | 7 +++++++ python/cudf/cudf/core/indexed_frame.py | 11 ++++++----- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d7a75cb9f40..252fe639a0b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -157,9 +157,7 @@ def _from_columns( n_index_columns = 0 if index_names is not None: n_index_columns = len(index_names) - index = cudf.core.index._index_from_data( - dict(zip(range(n_index_columns), columns)) - ) + index = cudf.core.index._index_from_columns(columns) if isinstance(index, cudf.MultiIndex): index.names = index_names else: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8f905ee6d49..9dedde4502a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -114,6 +114,13 @@ def _index_from_data(data: MutableMapping, name: Any = None): return index_class_type._from_data(data, None, name) +def _index_from_columns( + columns: List[cudf.core.column.ColumnBase], name: Any = None +): + """Construct an index from ``columns``, with levels named 0, 1, 2...""" + return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) + + class RangeIndex(BaseIndex): """ Immutable Index implementing a monotonic integer range. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a76b8bebb44..6ce5e9adaea 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,7 +19,11 @@ from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame -from cudf.core.index import Index, RangeIndex, _index_from_data +from cudf.core.index import ( + Index, + RangeIndex, + _index_from_columns, +) from cudf.core.multiindex import MultiIndex from cudf.utils.utils import _gather_map_is_valid, cached_property @@ -893,10 +897,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): index_names, ) = self._index._split_columns_by_levels(level) if index_columns: - index = _index_from_data( - dict(zip(range(len(index_columns)), index_columns)), - name=self._index.name, - ) + index = _index_from_columns(index_columns, name=self._index.name,) if isinstance(index, MultiIndex): index.names = index_names else: From 45ff7d6dfc45ff5bc9f139570c5b9fd14a62728c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 15:58:12 -0800 Subject: [PATCH 20/28] doc --- python/cudf/cudf/core/indexed_frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6ce5e9adaea..3e93e0e3dcf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,11 +19,7 @@ from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame -from cudf.core.index import ( - Index, - RangeIndex, - _index_from_columns, -) +from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.utils.utils import _gather_map_is_valid, cached_property @@ -913,6 +909,8 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): if name == "index" and "index" in self._data: name = "level_0" new_column_data[name] = col + # This is to match pandas where the new data columns are always + # inserted to the left of existing data columns. return {**new_column_data, **self._data}, index From 4e38d97b77662168f43accdf5bcbff79f0419e87 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 15:58:49 -0800 Subject: [PATCH 21/28] make naming consistent --- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3e93e0e3dcf..9f8575fcd22 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -889,7 +889,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): ( data_columns, index_columns, - column_names, + data_names, index_names, ) = self._index._split_columns_by_levels(level) if index_columns: @@ -905,7 +905,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): return self._data, index new_column_data = {} - for name, col in zip(column_names, data_columns): + for name, col in zip(data_names, data_columns): if name == "index" and "index" in self._data: name = "level_0" new_column_data[name] = col From 1eb98737860a6503cf4c79ca48420497fc429f80 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 16:10:23 -0800 Subject: [PATCH 22/28] raise all encountered duplicates --- python/cudf/cudf/core/indexed_frame.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9f8575fcd22..5d13ab2ca88 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -921,8 +921,10 @@ def _check_duplicate_level_names(specified, level_names): return duplicates = {key for key, val in Counter(level_names).items() if val > 1} - for x in specified: - if x in duplicates: - raise ValueError( - f"The name {x} occurs multiple times, use a level number" - ) + duplicates_specified = [spec for spec in specified if spec in duplicates] + if not len(duplicates_specified) == 0: + # Note: pandas raises first encountered duplicates, cuDF raises all. + raise ValueError( + f"The names {duplicates_specified} occurs multiple times, use a" + " level number" + ) From b9dc154d23c712104ee35b69ca09d3ab3751b4df Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 7 Dec 2021 16:13:14 -0800 Subject: [PATCH 23/28] move doc template to indexedframe --- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/core/indexed_frame.py | 25 +++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 3 ++- python/cudf/cudf/utils/docutils.py | 25 ------------------------- 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 17b0b0b6ea0..ce83ce446d5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,6 +58,7 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) from cudf.core.resample import DataFrameResampler from cudf.core.series import Series @@ -2487,7 +2488,7 @@ def set_index( return df if not inplace else None @docutils.doc_apply( - docutils.doc_reset_index_template.format( + doc_reset_index_template.format( klass="DataFrame", argument="", return_type="DataFrame or None", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5d13ab2ca88..102190844df 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -23,6 +23,31 @@ from cudf.core.multiindex import MultiIndex from cudf.utils.utils import _gather_map_is_valid, cached_property +doc_reset_index_template = """ + Reset the index of the {klass}, or a level of it. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. +{argument} + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + + Returns + ------- + {return_type} + {klass} with the new index or None if ``inplace=True``.{return_doc} + + Examples + -------- + {example} +""" + def _indices_from_labels(obj, labels): from cudf.core.column import column diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e7c1ea592c2..cbdce1e44ce 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -65,6 +65,7 @@ _FrameIndexer, _get_label_range_or_mask, _indices_from_labels, + doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils import cudautils, docutils @@ -832,7 +833,7 @@ def reindex(self, index=None, copy=True): return series @docutils.doc_apply( - docutils.doc_reset_index_template.format( + doc_reset_index_template.format( klass="Series", argument=""" name : object, optional diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 93e8690b991..7a4a2673f9b 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -345,28 +345,3 @@ def wrapper(func): max 3.0 """ ) - -doc_reset_index_template = """ - Reset the index of the {klass}, or a level of it. - - Parameters - ---------- - level : int, str, tuple, or list, default None - Only remove the given levels from the index. Removes all levels by - default. - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. -{argument} - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - - Returns - ------- - {return_type} - {klass} with the new index or None if ``inplace=True``.{return_doc} - - Examples - -------- - {example} -""" From 06a9d6a63934abb129c2b4ef9911df426e9c40b4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 8 Dec 2021 11:07:41 -0800 Subject: [PATCH 24/28] test fix --- python/cudf/cudf/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f85af37e7f7..c00b3aee536 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -157,7 +157,9 @@ def _from_columns( n_index_columns = 0 if index_names is not None: n_index_columns = len(index_names) - index = cudf.core.index._index_from_columns(columns) + index = cudf.core.index._index_from_columns( + columns[:n_index_columns] + ) if isinstance(index, cudf.MultiIndex): index.names = index_names else: From 0772bca379352cce6579c9e651b305bbcff443a0 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 10 Dec 2021 10:19:52 -0800 Subject: [PATCH 25/28] fix broken tests due to change in exception raised --- python/cudf/cudf/tests/test_dataframe.py | 1 + python/cudf/cudf/tests/test_series.py | 84 +++++++++++++----------- 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index eed053bddb0..467b0a5bb43 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2621,6 +2621,7 @@ def test_reset_index_dup_level_name(level, drop, inplace): [], {"level": level, "drop": drop, "inplace": inplace}, ), + expected_error_message="occurs multiple times, use a level number", ) return diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 26400219957..1885b39c7e0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1361,19 +1361,10 @@ def test_reset_index(level, drop, inplace, original_name, name): gs = cudf.from_pandas(ps) if not drop and inplace: - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) - return expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) @@ -1394,20 +1385,11 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) ps = pd.Series(range(4), index=midx, name=original_name) gs = cudf.from_pandas(ps) - if level == [None] or (not drop and inplace): - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), + if level == [None] or not drop and inplace: + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) - return expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) @@ -1430,19 +1412,10 @@ def test_reset_index_named(drop, inplace, original_name, name): gs.index.name = "cudf" if not drop and inplace: - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": None, "drop": drop, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [], - {"level": None, "drop": drop, "inplace": inplace}, - ), + pytest.skip( + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) - return expect = ps.reset_index(drop=drop, inplace=inplace, name=name) got = gs.reset_index(drop=drop, inplace=inplace, name=name) @@ -1454,6 +1427,43 @@ def test_reset_index_named(drop, inplace, original_name, name): assert_eq(expect, got) +def test_reset_index_dup_level_name_exceptions(): + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + ps = pd.Series(range(4), index=midx) + gs = cudf.from_pandas(ps) + + # Should specify duplicate level names with level number. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"level": [None]},), + rfunc_args_and_kwargs=([], {"level": [None]},), + expected_error_message="occurs multiple times, use a level number", + ) + + # Cannot use drop=False and inplace=True to turn a series into dataframe. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + rfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + ) + + # Pandas raises the above exception should these two inputs crosses. + assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + rfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + ) + + def test_series_add_prefix(): cd_s = cudf.Series([1, 2, 3, 4]) pd_s = cd_s.to_pandas() From 63727941bba0ebf0402f4b0584fd36154d3afb96 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 10 Dec 2021 15:48:05 -0800 Subject: [PATCH 26/28] also support multilevel column names in reset index --- python/cudf/cudf/core/indexed_frame.py | 32 ++++++---- python/cudf/cudf/tests/test_dataframe.py | 75 ++++++++++++++++++++---- python/cudf/cudf/utils/utils.py | 24 +++++++- 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8e64a2677e3..a1b7543ee8c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -23,10 +23,15 @@ is_list_like, ) from cudf.core.column import arange +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import _gather_map_is_valid, cached_property +from cudf.utils.utils import ( + _gather_map_is_valid, + _make_column_name, + cached_property, +) doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -1132,16 +1137,6 @@ def resample( def _reset_index(self, level, drop, col_level=0, col_fill=""): """Shared path for DataFrame.reset_index and Series.reset_index.""" - if col_level != 0: - raise NotImplementedError( - "col_level parameter is not supported yet." - ) - - if col_fill != "": - raise NotImplementedError( - "col_fill parameter is not supported yet." - ) - if level is not None and not isinstance(level, (tuple, list)): level = (level,) _check_duplicate_level_names(level, self._index.names) @@ -1169,13 +1164,26 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): for name, col in zip(data_names, data_columns): if name == "index" and "index" in self._data: name = "level_0" + name = _make_column_name( + name, + self._data.multiindex, + col_level, + col_fill, + self._data.nlevels, + ) new_column_data[name] = col # This is to match pandas where the new data columns are always # inserted to the left of existing data columns. - return {**new_column_data, **self._data}, index + return ( + ColumnAccessor( + {**new_column_data, **self._data}, self._data.multiindex + ), + index, + ) def _check_duplicate_level_names(specified, level_names): + """Raise if any of `specified` has duplicates in `level_names`.""" if specified is None: return if len(set(level_names)) == len(level_names): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 467b0a5bb43..9c6fc5cb1e3 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2581,9 +2581,18 @@ def test_tail_for_string(): @pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("column_names", [["v0", "v1"], ["v0", "index"]]) +@pytest.mark.parametrize( + "column_names", + [ + ["v0", "v1"], + ["v0", "index"], + pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + ], +) @pytest.mark.parametrize("inplace", [True, False]) -def test_reset_index(level, drop, column_names, inplace): +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): midx = pd.MultiIndex.from_tuples( [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] ) @@ -2592,8 +2601,20 @@ def test_reset_index(level, drop, column_names, inplace): ) gdf = cudf.from_pandas(pdf) - expect = pdf.reset_index(level=level, drop=drop, inplace=inplace) - got = gdf.reset_index(level=level, drop=drop, inplace=inplace) + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) if inplace: expect = pdf got = gdf @@ -2604,7 +2625,9 @@ def test_reset_index(level, drop, column_names, inplace): @pytest.mark.parametrize("level", [None, 0, 1, [None]]) @pytest.mark.parametrize("drop", [False, True]) @pytest.mark.parametrize("inplace", [False, True]) -def test_reset_index_dup_level_name(level, drop, inplace): +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): # midx levels are named [None, None] midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) @@ -2625,8 +2648,20 @@ def test_reset_index_dup_level_name(level, drop, inplace): ) return - expect = pdf.reset_index(level=level, drop=drop, inplace=inplace) - got = gdf.reset_index(level=level, drop=drop, inplace=inplace) + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) if inplace: expect = pdf got = gdf @@ -2636,12 +2671,18 @@ def test_reset_index_dup_level_name(level, drop, inplace): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("inplace", [False, True]) -def test_reset_index_named(pdf, gdf, drop, inplace): +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_named(pdf, gdf, drop, inplace, col_level, col_fill): pdf.index.name = "cudf" gdf.index.name = "cudf" - expect = pdf.reset_index(drop=drop, inplace=inplace) - got = gdf.reset_index(drop=drop, inplace=inplace) + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) if inplace: expect = pdf got = gdf @@ -2651,12 +2692,20 @@ def test_reset_index_named(pdf, gdf, drop, inplace): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("inplace", [False, True]) @pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) -def test_reset_index_unnamed(pdf, gdf, drop, inplace, column_names): +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_unnamed( + pdf, gdf, drop, inplace, column_names, col_level, col_fill +): pdf.columns = column_names gdf.columns = column_names - expect = pdf.reset_index(drop=drop, inplace=inplace) - got = gdf.reset_index(drop=drop, inplace=inplace) + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) if inplace: expect = pdf got = gdf diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index cea384b9c11..da8d48a4b92 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -3,7 +3,7 @@ import decimal import functools from collections.abc import Sequence -from typing import FrozenSet, Set, Union +from typing import Any, FrozenSet, Set, Union import cupy as cp import numpy as np @@ -524,3 +524,25 @@ def _gather_map_is_valid( return True gm_min, gm_max = minmax(gather_map) return gm_min >= -nrows and gm_max < nrows + + +def _make_column_name( + name: Any, + multilevel: bool, + col_level: int = 0, + col_fill: Any = "", + total_level: int = 0, +): + """Make a cudf column name from `name`. + + If `multilevel` is True, return a tuple with len == `total_level`. `name` + is the `col_level`th item, the rest is `col_fill`. + + Otherwise, return ``name``. + """ + # Should this be a ColumnAccessor method instead? + return ( + tuple(name if i == col_level else col_fill for i in range(total_level)) + if multilevel + else name + ) From fb02ae38953e14434b77bacac15a8ddf08fd4685 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 14 Dec 2021 16:33:08 -0800 Subject: [PATCH 27/28] move helper inline --- python/cudf/cudf/core/indexed_frame.py | 19 ++++++++----------- python/cudf/cudf/utils/utils.py | 24 +----------------------- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a1b7543ee8c..36708ab2749 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -27,11 +27,7 @@ from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import ( - _gather_map_is_valid, - _make_column_name, - cached_property, -) +from cudf.utils.utils import _gather_map_is_valid, cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -1164,12 +1160,13 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): for name, col in zip(data_names, data_columns): if name == "index" and "index" in self._data: name = "level_0" - name = _make_column_name( - name, - self._data.multiindex, - col_level, - col_fill, - self._data.nlevels, + name = ( + tuple( + name if i == col_level else col_fill + for i in range(self._data.nlevels) + ) + if self._data.multiindex + else name ) new_column_data[name] = col # This is to match pandas where the new data columns are always diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index da8d48a4b92..cea384b9c11 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -3,7 +3,7 @@ import decimal import functools from collections.abc import Sequence -from typing import Any, FrozenSet, Set, Union +from typing import FrozenSet, Set, Union import cupy as cp import numpy as np @@ -524,25 +524,3 @@ def _gather_map_is_valid( return True gm_min, gm_max = minmax(gather_map) return gm_min >= -nrows and gm_max < nrows - - -def _make_column_name( - name: Any, - multilevel: bool, - col_level: int = 0, - col_fill: Any = "", - total_level: int = 0, -): - """Make a cudf column name from `name`. - - If `multilevel` is True, return a tuple with len == `total_level`. `name` - is the `col_level`th item, the rest is `col_fill`. - - Otherwise, return ``name``. - """ - # Should this be a ColumnAccessor method instead? - return ( - tuple(name if i == col_level else col_fill for i in range(total_level)) - if multilevel - else name - ) From 1fb372c047a6a13fa40843dc0757de75c400eee7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Jan 2022 06:12:03 -0800 Subject: [PATCH 28/28] fix docstring --- python/cudf/cudf/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c97a23d1dc8..64a778f587d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -855,7 +855,7 @@ def reindex(self, index=None, copy=True): 13 d dtype: object >>> series.reset_index() - index 0 + index 0 0 10 a 1 11 b 2 12 c @@ -879,12 +879,12 @@ def reindex(self, index=None, copy=True): >>> s2 a b bar one 0 - two 1 + two 1 baz one 2 - two 3 + two 3 Name: foo, dtype: int64 >>> s2.reset_index(level='a') - a foo + a foo b one bar 0 two bar 1