Fix loc for Series with a MultiIndex (#7243)

Fixes #7221 and adds improvements to `loc` with a `MultiIndex`. * Previously, `loc` on a `Series` with a `MultiIndex` would fail. For example: ```python In [7]: sr Out[7]: n_workers type 1 fit 1 2 load 2 3 predict 3 Name: x, dtype: int64 In [8]: sr.loc[(1, "fit")] # KeyError ``` * Previously, `loc` on a `DataFrame` with a `MultiIndex` would fail when a slice without `start` or `stop` was used. For example: ```python In [3]: df Out[3]: x n_workers type 1 fit 1 2 load 2 3 predict 3 In [4]: df.loc[:(2, "load")] # TypeError ``` Both of the above issues have been addressed, and tests have been added. Authors: - Ashwin Srinath (@shwina) Approvers: - Keith Kraus (@kkraus14) - Michael Wang (@isVoid) - GALI PREM SAGAR (@galipremsagar) - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) URL: #7243
rapidsai · Jan 29, 2021 · 019d7cc · 019d7cc
1 parent fe5e07d
commit 019d7cc
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 11 deletions.
diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
@@ -26,3 +26,5 @@
 
 # binary operation
 BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"]
+
+DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
@@ -1,10 +1,14 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+
+from typing import Any, Union
+
 import numpy as np
 import pandas as pd
 from nvtx import annotate
 
 import cudf
 from cudf._lib.scalar import _is_null_host_scalar
+from cudf._typing import DataFrameOrSeries, ScalarLike
 from cudf.utils.dtypes import (
     is_categorical_dtype,
     is_column_like,
@@ -113,7 +117,21 @@ class _SeriesLocIndexer(object):
     def __init__(self, sr):
         self._sr = sr
 
-    def __getitem__(self, arg):
+    def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]:
+        if isinstance(arg, pd.MultiIndex):
+            arg = cudf.from_pandas(arg)
+
+        if isinstance(self._sr.index, cudf.MultiIndex) and not isinstance(
+            arg, cudf.MultiIndex
+        ):
+            result = self._sr.index._get_row_major(self._sr, arg)
+            if (
+                isinstance(arg, tuple)
+                and len(arg) == self._sr._index.nlevels
+                and not any((isinstance(x, slice) for x in arg))
+            ):
+                result = result.iloc[0]
+            return result
         try:
             arg = self._loc_to_iloc(arg)
         except (TypeError, KeyError, IndexError, ValueError):

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -1,9 +1,11 @@
 # Copyright (c) 2019-2020, NVIDIA CORPORATION.
+import itertools
 import numbers
 import pickle
 import warnings
 from collections import OrderedDict
 from collections.abc import Sequence
+from typing import Any, List, Tuple, Union
 
 import cupy
 import numpy as np
@@ -12,6 +14,7 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._typing import DataFrameOrSeries
 from cudf.core.column import column
 from cudf.core.frame import Frame
 from cudf.core.index import Index, as_index
@@ -746,7 +749,6 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
         return result
 
     def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
-
         # Instructions for Slicing
         # if tuple, get first and last elements of tuple
         # if open beginning tuple, get 0 to highest valid_index
@@ -833,13 +835,23 @@ def _index_and_downcast(self, result, index, index_key):
             result = result.set_index(index)
         return result
 
-    def _get_row_major(self, df, row_tuple):
-
+    def _get_row_major(
+        self,
+        df: DataFrameOrSeries,
+        row_tuple: Union[
+            numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
+        ],
+    ) -> DataFrameOrSeries:
         if pd.api.types.is_bool_dtype(
             list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple
         ):
             return df[row_tuple]
-
+        if isinstance(row_tuple, slice):
+            if row_tuple.start is None:
+                row_tuple = slice(self[0], row_tuple.stop, row_tuple.step)
+            if row_tuple.stop is None:
+                row_tuple = slice(row_tuple.start, self[-1], row_tuple.step)
+        self._validate_indexer(row_tuple)
         valid_indices = self._get_valid_indices_by_tuple(
             df.index, row_tuple, len(df.index)
         )
@@ -848,6 +860,32 @@ def _get_row_major(self, df, row_tuple):
         final = self._index_and_downcast(result, result.index, row_tuple)
         return final
 
+    def _validate_indexer(
+        self,
+        indexer: Union[
+            numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
+        ],
+    ):
+        if isinstance(indexer, numbers.Number):
+            return
+        if isinstance(indexer, tuple):
+            # drop any slice(None) from the end:
+            indexer = tuple(
+                itertools.dropwhile(
+                    lambda x: x == slice(None), reversed(indexer)
+                )
+            )[::-1]
+
+            # now check for size
+            if len(indexer) > self.nlevels:
+                raise IndexError("Indexer size exceeds number of levels")
+        elif isinstance(indexer, slice):
+            self._validate_indexer(indexer.start)
+            self._validate_indexer(indexer.stop)
+        else:
+            for i in indexer:
+                self._validate_indexer(i)
+
     def _split_tuples(self, tuples):
         if len(tuples) == 1:
             return tuples, slice(None)

diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
@@ -1343,3 +1343,21 @@ def test_loc_zero_dim_array():
 
     assert_eq(psr[np.array(0)], gsr[np.array(0)])
     assert_eq(psr[np.array([0])[0]], gsr[np.array([0])[0]])
+
+
+@pytest.mark.parametrize(
+    "arg",
+    [
+        slice(None),
+        slice((1, 2), None),
+        slice(None, (1, 2)),
+        (1, 1),
+        (1, slice(None)),
+    ],
+)
+def test_loc_series_multiindex(arg):
+    gsr = cudf.DataFrame(
+        {"a": [1, 1, 2], "b": [1, 2, 3], "c": ["a", "b", "c"]}
+    ).set_index(["a", "b"])["c"]
+    psr = gsr.to_pandas()
+    assert_eq(psr.loc[arg], gsr.loc[arg])
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
@@ -299,15 +299,36 @@ def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple):
     assert_eq(pdf.loc[key_tuple], gdf.loc[key_tuple])
 
 
-def test_multiindex_loc_slice(pdf, gdf, pdfIndex):
+@pytest.mark.parametrize(
+    "arg",
+    [
+        slice(("a", "store"), ("b", "house")),
+        slice(None, ("b", "house")),
+        slice(("a", "store"), None),
+        slice(None),
+    ],
+)
+def test_multiindex_loc_slice(pdf, gdf, pdfIndex, arg):
     gdf = cudf.from_pandas(pdf)
     gdfIndex = cudf.from_pandas(pdfIndex)
     pdf.index = pdfIndex
     gdf.index = gdfIndex
-    assert_eq(
-        pdf.loc[("a", "store"):("b", "house")],
-        gdf.loc[("a", "store"):("b", "house")],
-    )
+    assert_eq(pdf.loc[arg], gdf.loc[arg])
+
+
+def test_multiindex_loc_errors(pdf, gdf, pdfIndex):
+    gdf = cudf.from_pandas(pdf)
+    gdfIndex = cudf.from_pandas(pdfIndex)
+    gdf.index = gdfIndex
+
+    with pytest.raises(KeyError):
+        gdf.loc[("a", "store", "clouds", "foo")]
+    with pytest.raises(IndexError):
+        gdf.loc[
+            ("a", "store", "clouds", "fire", "x", "y")
+        ]  # too many indexers
+    with pytest.raises(IndexError):
+        gdf.loc[slice(None, ("a", "store", "clouds", "fire", "x", "y"))]
 
 
 def test_multiindex_loc_then_column(pdf, gdf, pdfIndex):
Original file line number	Diff line number	Diff line change
Expand Up		@@ -26,3 +26,5 @@

		# binary operation
		BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"]

		DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]