Skip to content

Commit

Permalink
Fix loc for Series with a MultiIndex (#7243)
Browse files Browse the repository at this point in the history
Fixes #7221 and adds improvements to `loc` with a MultiIndex.

* Previously, `loc` on a `Series` with a `MultiIndex` would fail. For example:

```python
In [7]: sr
Out[7]:
n_workers  type
1          fit        1
2          load       2
3          predict    3
Name: x, dtype: int64

In [8]: sr.loc[(1, "fit")]  # KeyError
```

* Previously, `loc` on a `DataFrame` with a `MultiIndex` would fail when a slice without a `start` or `stop` bound was used. For example:

```python
In [3]: df
Out[3]:
                   x
n_workers type
1         fit      1
2         load     2
3         predict  3

In [4]: df.loc[:(2, "load")]  # TypeError
```

Both the above issues have been addressed and tests added.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)
  - Michael Wang (@isVoid)
  - GALI PREM SAGAR (@galipremsagar)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: #7243
  • Loading branch information
shwina authored Jan 29, 2021
1 parent fe5e07d commit 019d7cc
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 11 deletions.
2 changes: 2 additions & 0 deletions python/cudf/cudf/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@

# binary operation
BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"]

DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
22 changes: 20 additions & 2 deletions python/cudf/cudf/core/indexing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

from typing import Any, Union

import numpy as np
import pandas as pd
from nvtx import annotate

import cudf
from cudf._lib.scalar import _is_null_host_scalar
from cudf._typing import DataFrameOrSeries, ScalarLike
from cudf.utils.dtypes import (
is_categorical_dtype,
is_column_like,
Expand Down Expand Up @@ -113,7 +117,21 @@ class _SeriesLocIndexer(object):
def __init__(self, sr):
    """Store the object that this ``loc`` indexer operates on.

    Parameters
    ----------
    sr
        The object to index; kept as ``self._sr``.
        NOTE(review): presumably a ``cudf.Series`` given the class
        name -- confirm against callers.
    """
    self._sr = sr

def __getitem__(self, arg):
def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]:
if isinstance(arg, pd.MultiIndex):
arg = cudf.from_pandas(arg)

if isinstance(self._sr.index, cudf.MultiIndex) and not isinstance(
arg, cudf.MultiIndex
):
result = self._sr.index._get_row_major(self._sr, arg)
if (
isinstance(arg, tuple)
and len(arg) == self._sr._index.nlevels
and not any((isinstance(x, slice) for x in arg))
):
result = result.iloc[0]
return result
try:
arg = self._loc_to_iloc(arg)
except (TypeError, KeyError, IndexError, ValueError):
Expand Down
46 changes: 42 additions & 4 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
import itertools
import numbers
import pickle
import warnings
from collections import OrderedDict
from collections.abc import Sequence
from typing import Any, List, Tuple, Union

import cupy
import numpy as np
Expand All @@ -12,6 +14,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import DataFrameOrSeries
from cudf.core.column import column
from cudf.core.frame import Frame
from cudf.core.index import Index, as_index
Expand Down Expand Up @@ -746,7 +749,6 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
return result

def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):

# Instructions for Slicing
# if tuple, get first and last elements of tuple
# if open beginning tuple, get 0 to highest valid_index
Expand Down Expand Up @@ -833,13 +835,23 @@ def _index_and_downcast(self, result, index, index_key):
result = result.set_index(index)
return result

def _get_row_major(self, df, row_tuple):

def _get_row_major(
self,
df: DataFrameOrSeries,
row_tuple: Union[
numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
],
) -> DataFrameOrSeries:
if pd.api.types.is_bool_dtype(
list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple
):
return df[row_tuple]

if isinstance(row_tuple, slice):
if row_tuple.start is None:
row_tuple = slice(self[0], row_tuple.stop, row_tuple.step)
if row_tuple.stop is None:
row_tuple = slice(row_tuple.start, self[-1], row_tuple.step)
self._validate_indexer(row_tuple)
valid_indices = self._get_valid_indices_by_tuple(
df.index, row_tuple, len(df.index)
)
Expand All @@ -848,6 +860,32 @@ def _get_row_major(self, df, row_tuple):
final = self._index_and_downcast(result, result.index, row_tuple)
return final

def _validate_indexer(
    self,
    indexer: Union[
        numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
    ],
) -> None:
    """Check that ``indexer`` does not address more levels than exist.

    Parameters
    ----------
    indexer
        One of: a scalar label (a number or a string), a tuple of
        per-level keys, a slice whose bounds are themselves indexers
        (``None`` bounds are allowed and ignored), or an iterable of
        indexers, each of which is validated in turn.

    Raises
    ------
    IndexError
        If a tuple key, ignoring trailing ``slice(None)`` entries, has
        more elements than ``self.nlevels``.
    """
    # Scalar labels are always valid. Strings must be treated as scalars
    # explicitly: iterating a string yields strings again, so falling
    # through to the generic-iterable branch would recurse without end.
    if isinstance(indexer, (numbers.Number, str)):
        return

    if isinstance(indexer, tuple):
        # Trailing ``slice(None)`` entries do not count toward the size.
        # The isinstance guard avoids applying ``==`` to array-like
        # elements, whose comparison result may not be a plain bool.
        significant = tuple(
            itertools.dropwhile(
                lambda key: isinstance(key, slice) and key == slice(None),
                reversed(indexer),
            )
        )
        if len(significant) > self.nlevels:
            raise IndexError("Indexer size exceeds number of levels")
    elif isinstance(indexer, slice):
        # An open bound (``None``) carries no key, so there is nothing
        # to validate for it.
        for bound in (indexer.start, indexer.stop):
            if bound is not None:
                self._validate_indexer(bound)
    else:
        for item in indexer:
            self._validate_indexer(item)

def _split_tuples(self, tuples):
if len(tuples) == 1:
return tuples, slice(None)
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,3 +1343,21 @@ def test_loc_zero_dim_array():

assert_eq(psr[np.array(0)], gsr[np.array(0)])
assert_eq(psr[np.array([0])[0]], gsr[np.array([0])[0]])


@pytest.mark.parametrize(
    "arg",
    [
        slice(None),
        slice((1, 2), None),
        slice(None, (1, 2)),
        (1, 1),
        (1, slice(None)),
    ],
)
def test_loc_series_multiindex(arg):
    """Compare ``Series.loc`` on a two-level MultiIndex against pandas.

    The parametrized keys cover a full slice, slices that are open at
    one end, a complete tuple key, and a tuple key whose second element
    is ``slice(None)``.
    """
    columns = {"a": [1, 1, 2], "b": [1, 2, 3], "c": ["a", "b", "c"]}
    frame = cudf.DataFrame(columns)
    # Index by ("a", "b") and keep column "c" as the Series under test.
    gsr = frame.set_index(["a", "b"])["c"]
    psr = gsr.to_pandas()

    expected = psr.loc[arg]
    got = gsr.loc[arg]
    assert_eq(expected, got)
31 changes: 26 additions & 5 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,15 +299,36 @@ def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple):
assert_eq(pdf.loc[key_tuple], gdf.loc[key_tuple])


def test_multiindex_loc_slice(pdf, gdf, pdfIndex):
@pytest.mark.parametrize(
"arg",
[
slice(("a", "store"), ("b", "house")),
slice(None, ("b", "house")),
slice(("a", "store"), None),
slice(None),
],
)
def test_multiindex_loc_slice(pdf, gdf, pdfIndex, arg):
gdf = cudf.from_pandas(pdf)
gdfIndex = cudf.from_pandas(pdfIndex)
pdf.index = pdfIndex
gdf.index = gdfIndex
assert_eq(
pdf.loc[("a", "store"):("b", "house")],
gdf.loc[("a", "store"):("b", "house")],
)
assert_eq(pdf.loc[arg], gdf.loc[arg])


def test_multiindex_loc_errors(pdf, gdf, pdfIndex):
    """Check the exceptions raised by ``loc`` for invalid MultiIndex keys.

    A tuple key is expected to raise ``KeyError``, while a six-element
    tuple -- used directly or as the stop bound of a slice -- is
    expected to raise ``IndexError``.
    """
    # Build the frame from the pandas fixtures; the ``gdf`` fixture
    # argument itself is not used.
    frame = cudf.from_pandas(pdf)
    frame.index = cudf.from_pandas(pdfIndex)

    missing_key = ("a", "store", "clouds", "foo")
    oversized_key = ("a", "store", "clouds", "fire", "x", "y")

    with pytest.raises(KeyError):
        frame.loc[missing_key]
    with pytest.raises(IndexError):
        frame.loc[oversized_key]  # too many indexers
    with pytest.raises(IndexError):
        frame.loc[slice(None, oversized_key)]


def test_multiindex_loc_then_column(pdf, gdf, pdfIndex):
Expand Down

0 comments on commit 019d7cc

Please sign in to comment.