diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 0087daa1676..034b18ec9e0 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -26,3 +26,5 @@ # binary operation BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] + +DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index bcbb8454a63..4d685408df3 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -1,10 +1,14 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + +from typing import Any, Union + import numpy as np import pandas as pd from nvtx import annotate import cudf from cudf._lib.scalar import _is_null_host_scalar +from cudf._typing import DataFrameOrSeries, ScalarLike from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, @@ -113,7 +117,21 @@ class _SeriesLocIndexer(object): def __init__(self, sr): self._sr = sr - def __getitem__(self, arg): + def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + if isinstance(arg, pd.MultiIndex): + arg = cudf.from_pandas(arg) + + if isinstance(self._sr.index, cudf.MultiIndex) and not isinstance( + arg, cudf.MultiIndex + ): + result = self._sr.index._get_row_major(self._sr, arg) + if ( + isinstance(arg, tuple) + and len(arg) == self._sr._index.nlevels + and not any((isinstance(x, slice) for x in arg)) + ): + result = result.iloc[0] + return result try: arg = self._loc_to_iloc(arg) except (TypeError, KeyError, IndexError, ValueError): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f182e9d238c..e68be1e4584 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,9 +1,11 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. 
def _validate_indexer(
    self,
    indexer: Union[
        numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
    ],
) -> None:
    """Reject row indexers that address more levels than the index has.

    The check is structural only: it looks at how many entries a key
    carries, not at whether the labels exist in the index.

    Parameters
    ----------
    indexer
        A scalar label, a tuple key (one entry per level), a slice whose
        bounds are themselves indexers, or an iterable of indexers.

    Raises
    ------
    IndexError
        If a tuple key, ignoring trailing ``slice(None)`` entries, has
        more entries than ``self.nlevels``.
    """
    # Scalar keys address a single level, so they can never be too long.
    # ``str``/``bytes`` must be caught here: iterating a one-character
    # string yields that same string, so sending it to the generic
    # iterable branch below would recurse without end.  ``None`` is an
    # open slice bound and is likewise nothing to validate.
    if indexer is None or isinstance(
        indexer, (numbers.Number, str, bytes)
    ):
        return

    if isinstance(indexer, tuple):
        # Trailing ``slice(None)`` entries select "everything" and do not
        # count towards the key's depth.  The isinstance guard keeps the
        # equality test from being applied to arbitrary members (e.g.
        # array-likes whose ``==`` is element-wise).
        significant = sum(
            1
            for _ in itertools.dropwhile(
                lambda key: isinstance(key, slice) and key == slice(None),
                reversed(indexer),
            )
        )
        if significant > self.nlevels:
            raise IndexError("Indexer size exceeds number of levels")
    elif isinstance(indexer, slice):
        # Each bound is an indexer in its own right.
        self._validate_indexer(indexer.start)
        self._validate_indexer(indexer.stop)
    else:
        # Any other iterable is treated as a collection of indexers.
        for member in indexer:
            self._validate_indexer(member)
def test_multiindex_loc_errors(pdf, gdf, pdfIndex):
    """``.loc`` on a MultiIndex-ed frame raises for bad row keys.

    A key with a label that cannot be found is expected to raise
    ``KeyError``; a key with too many indexers is expected to raise
    ``IndexError``, both when passed directly and when used as a slice
    bound.
    """
    gdf = cudf.from_pandas(pdf)
    gdf.index = cudf.from_pandas(pdfIndex)

    too_many_indexers = ("a", "store", "clouds", "fire", "x", "y")
    failing_lookups = [
        (KeyError, ("a", "store", "clouds", "foo")),
        (IndexError, too_many_indexers),
        (IndexError, slice(None, too_many_indexers)),
    ]

    for expected_error, key in failing_lookups:
        with pytest.raises(expected_error):
            gdf.loc[key]