Skip to content

Commit

Permalink
Fix issues with MultiIndex in dropna, stack & reset_index (#8753)
Browse files Browse the repository at this point in the history

Fixes: #3583 

This PR contains fixes for:

- [x] `stack`: the MultiIndex names were not being assigned correctly in the `from_table` call.
- [x] `dropna`: the MultiIndex names were not being preserved after a `libcudf` API call.
- [x] `reset_index`: the MultiIndex level names were not being materialized correctly when the index was reset.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Ashwin Srinath (https://github.com/shwina)

URL: #8753
  • Loading branch information
galipremsagar authored Jul 16, 2021
1 parent 7ff4724 commit ceb3922
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 12 deletions.
24 changes: 17 additions & 7 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2987,17 +2987,22 @@ class max_speed
result = self
else:
result = self.copy()
if all(name is None for name in self.index.names):

if not drop:
if isinstance(self.index, cudf.MultiIndex):
names = tuple(
f"level_{i}" for i, _ in enumerate(self.index.names)
name if name is not None else f"level_{i}"
for i, name in enumerate(self.index.names)
)
else:
names = ("index",)
else:
names = self.index.names
if self.index.name is None:
if "index" in self._data.names:
names = ("level_0",)
else:
names = ("index",)
else:
names = (self.index.name,)

if not drop:
index_columns = self.index._data.columns
for name, index_column in zip(
reversed(names), reversed(index_columns)
Expand Down Expand Up @@ -7493,8 +7498,13 @@ def stack(self, level=-1, dropna=True):
repeated_index = self.index.repeat(self.shape[1])
name_index = Frame({0: self._column_names}).tile(self.shape[0])
new_index = list(repeated_index._columns) + [name_index._columns[0]]
if isinstance(self._index, cudf.MultiIndex):
index_names = self._index.names + [None]
else:
index_names = [None] * len(new_index)
new_index = cudf.core.multiindex.MultiIndex.from_frame(
DataFrame(dict(zip(range(0, len(new_index)), new_index)))
DataFrame(dict(zip(range(0, len(new_index)), new_index))),
names=index_names,
)

# Collect datatypes and cast columns as that type
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1380,6 +1380,10 @@ def _drop_na_rows(
)
)
result._copy_type_metadata(frame)
if self._index is not None:
result._index.name = self._index.name
if isinstance(self._index, cudf.MultiIndex):
result._index.names = self._index.names
return result

def _drop_na_columns(self, how="any", subset=None, thresh=None):
Expand Down
27 changes: 22 additions & 5 deletions python/cudf/cudf/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,33 @@ def test_df_stack(nulls, num_cols, num_rows, dtype):
gdf = cudf.from_pandas(pdf)

got = gdf.stack()

expect = pdf.stack()
if {None} == set(expect.index.names):
expect.rename_axis(
list(range(0, len(expect.index.names))), inplace=True
)

assert_eq(expect, got)


def test_df_stack_reset_index():
    """Check ``stack`` and a following ``reset_index`` on a two-level index.

    A frame indexed by columns ``a`` and ``b`` is stacked both directly and
    through its ``to_pandas()`` counterpart; the two results are compared
    before and after ``reset_index`` is applied to each.
    """
    gdf = cudf.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [10, 11, 12, 13],
            "c": ["ab", "cd", None, "gh"],
        }
    ).set_index(["a", "b"])
    pdf = gdf.to_pandas()

    stacked_expect = pdf.stack()
    stacked_got = gdf.stack()
    assert_eq(stacked_expect, stacked_got)

    # Resetting the index turns the index levels back into columns, so the
    # level names produced by ``stack`` must agree for this to pass.
    assert_eq(stacked_expect.reset_index(), stacked_got.reset_index())


@pytest.mark.parametrize("num_rows", [1, 2, 10, 1000])
@pytest.mark.parametrize("num_cols", [1, 2, 10])
@pytest.mark.parametrize(
Expand Down

0 comments on commit ceb3922

Please sign in to comment.