From b9134db0ad48ab33f5a38d727a31269652f5e0d6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 10:41:33 +0100 Subject: [PATCH 01/16] ColumnAccessor does not support duplicate key names Raise NotImplementedError if indexing would try and produce a new object with duplicate key names. This avoids silently losing information. --- python/cudf/cudf/core/column_accessor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bec9c367ba9..0607b2bf6e4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -404,6 +404,10 @@ def select_by_index(self, index: Any) -> ColumnAccessor: ColumnAccessor """ keys = self.get_labels_by_index(index) + if len(set(keys)) != len(keys): + raise NotImplementedError( + "cudf DataFrames do not support repeated column names" + ) data = {k: self._data[k] for k in keys} return self.__class__( data, @@ -499,6 +503,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: if keep ) else: + if len(set(key)) != len(key): + raise NotImplementedError( + "cudf DataFrames do not support repeated column names" + ) data = {k: self._grouped_data[k] for k in key} if self.multiindex: data = _to_flat_dict(data) From f5bfb7b3cc1048e0d154a820c9b588644bdb4549 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 10:48:49 +0100 Subject: [PATCH 02/16] Column index destructuring returns ColumnAccessor Rather than just returning column names, since we will immediately construct the ColumnAccessor, make that and return it. --- python/cudf/cudf/core/dataframe.py | 18 ++++++------------ python/cudf/cudf/core/indexing_utils.py | 19 ++++++------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e67604069f1..6fcba4e9603 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -421,22 +421,16 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): def __getitem__(self, arg): row_key, ( col_is_scalar, - column_names, + ca, ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame) row_spec = indexing_utils.parse_row_iloc_indexer( row_key, len(self._frame) ) - ca = self._frame._data - index = self._frame.index if col_is_scalar: - s = Series._from_data( - ca._select_by_names(column_names), index=index - ) - return s._getitem_preprocessed(row_spec) - if column_names != list(self._frame._column_names): - frame = self._frame._from_data( - ca._select_by_names(column_names), index=index - ) + series = Series._from_data(ca, index=self._frame.index) + return series._getitem_preprocessed(row_spec) + if ca.names != self._frame._data.names: + frame = self._frame._from_data(ca, index=self._frame.index) else: frame = self._frame if isinstance(row_spec, indexing_utils.MapIndexer): @@ -454,7 +448,7 @@ def __getitem__(self, arg): # you only ask for one row. new_name = result.index[0] result = Series._concat( - [result[name] for name in column_names], + [result[name] for name in frame._data.names], index=result.keys(), ) result.name = new_name diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 7242de9964f..385f0074563 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,7 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Tuple, Union +from typing import Any, Tuple, Union from typing_extensions import TypeAlias @@ -14,6 +14,7 @@ is_integer, is_integer_dtype, ) +from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap @@ -55,8 +56,6 @@ class ScalarIndexer: EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer ] -ColumnLabels: TypeAlias = List[str] - def destructure_iloc_key( key: Any, frame: Union[cudf.Series, cudf.DataFrame] @@ -124,7 +123,7 @@ def destructure_iloc_key( def destructure_dataframe_iloc_indexer( key: Any, frame: cudf.DataFrame -) -> Tuple[Any, Tuple[bool, ColumnLabels]]: +) -> Tuple[Any, Tuple[bool, ColumnAccessor]]: """Destructure an index key for DataFrame iloc getitem. Parameters @@ -154,13 +153,7 @@ def destructure_dataframe_iloc_indexer( cols = slice(None) scalar = is_integer(cols) try: - column_names: ColumnLabels = list( - frame._data.get_labels_by_index(cols) - ) - if len(set(column_names)) != len(column_names): - raise NotImplementedError( - "cudf DataFrames do not support repeated column names" - ) + ca = frame._data.select_by_index(cols) except TypeError: raise TypeError( "Column indices must be integers, slices, " @@ -168,10 +161,10 @@ def destructure_dataframe_iloc_indexer( ) if scalar: assert ( - len(column_names) == 1 + len(ca) == 1 ), "Scalar column indexer should not produce more than one column" - return rows, (scalar, column_names) + return rows, (scalar, ca) def destructure_series_iloc_indexer(key: Any, frame: cudf.Series) -> Any: From 58a7cb7a03334f84936d3682a91c9c5929d85400 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 10:50:22 +0100 Subject: [PATCH 03/16] Remove unused code --- python/cudf/cudf/core/column_accessor.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 0607b2bf6e4..72428da0e8a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -20,7 +20,6 @@ import pandas as pd from packaging.version import Version from pandas.api.types import is_bool -from typing_extensions import Self import cudf from cudf.core import column @@ -481,13 +480,6 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): self._data[key] = value self._clear_cache() - def _select_by_names(self, names: abc.Sequence) -> Self: - return self.__class__( - {key: self[key] for key in names}, - multiindex=self.multiindex, - level_names=self.level_names, - ) - def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: # Might be a generator key = tuple(key) From f5a9d61eb183516b86d85ff8afd0a95401486aae Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 12:24:58 +0100 Subject: [PATCH 04/16] Fall back to positional in Series.__getitem__ for integer key If the index is non-numeric, Series.__getitem__ should fall back to positional lookup for integer keys. --- python/cudf/cudf/core/series.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fef741ac09..5c3d320df8e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1362,6 +1362,17 @@ def _getitem_preprocessed( def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] + elif is_integer(arg) and self.index.dtype.kind not in {"i", "u", "f"}: + # Series getitem looks up integers by position if the + # index is non-numeric, but is deprecated in pandas 2 + # https://github.com/pandas-dev/pandas/issues/50617 + # warnings.warn( + # "Treating integer keys positionally is deprecated and " + # "will be removed in a future version. To find a value " + # "by position use ser.iloc[pos] instead", + # FutureWarning, + # ) + return self.iloc[arg] else: return self.loc[arg] From fed2bcbb1aa25cf5e844daf370d4678afcda06a3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 12:36:14 +0100 Subject: [PATCH 05/16] Parsing to structured data for label-based lookup Similar to positional indexing, implement parsing to structured IndexingSpec data for label indexing. This does not yet treat MultiIndex lookups for which the rules for combining the multi-level keys are more complicated. --- python/cudf/cudf/core/indexing_utils.py | 341 +++++++++++++++++++++++- 1 file changed, 340 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 385f0074563..e089040bbaf 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,20 +3,27 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, Tuple, Union +from typing import TYPE_CHECKING, Any, Tuple, Union +import numpy as np from typing_extensions import TypeAlias import cudf +import cudf._lib as libcudf +from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, is_integer, is_integer_dtype, + is_scalar, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class EmptyIndexer: """An indexer that will produce an empty result.""" @@ -234,3 +241,335 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: "Cannot index by location " f"with non-integer key of type {type(key)}" ) + + +def destructure_loc_key( + key: Any, frame: cudf.Series | cudf.DataFrame +) -> tuple[Any, ...]: + """ + Destructure a potentially tuple-typed key into row and column indexers + + Tuple arguments to loc indexing are treated specially. They are + picked apart into indexers for the row and column. If the number + of entries is less than the number of modes of the frame, missing + entries are slice-expanded. + + If the user-provided key is not a tuple, it is treated as if it + were a singleton tuple, and then slice-expanded. + + Once this destructuring has occurred, any entries that are + callables are then called with the indexed frame. This should + return a valid indexing object for the rows (respectively + columns), namely one of: + + - A boolean mask of the same length as the frame in the given + dimension + - A scalar label looked up in the index + - A scalar integer that indexes the frame + - An array-like of labels looked up in the index + - A slice of the index + - For multiindices, a tuple of per level indexers + + Slice-based indexing is on the closed interval [start, end], rather + than the semi-open interval [start, end) + + Parameters + ---------- + key + The key to destructure + frame + DataFrame or Series to provide context + + Returns + ------- + tuple of indexers with length equal to the dimension of the frame + + Raises + ------ + IndexError + If there are too many indexers. + """ + n = len(frame.shape) + if ( + isinstance(frame.index, cudf.MultiIndex) + and n == 2 + and isinstance(key, tuple) + and all(map(is_scalar, key)) + ): + # This is "best-effort" and ambiguous + if len(key) == 2: + if key[1] in frame.index._columns[1]: + # key just indexes the rows + key = (key,) + elif key[1] in frame._data: + # key indexes rows and columns + key = key + else: + # key indexes rows and we will raise a keyerror + key = (key,) + else: + # key just indexes rows + key = (key,) + if isinstance(key, tuple): + # Key potentially indexes rows and columns, slice-expand to + # shape of frame + indexers = key + (slice(None),) * (n - len(key)) + if len(indexers) > n: + raise IndexError( + f"Too many indexers: got {len(indexers)} expected {n}" + ) + else: + # Key indexes rows, slice-expand to shape of frame + indexers = (key, *(slice(None),) * (n - 1)) + return tuple(k(frame) if callable(k) else k for k in indexers) + + +def destructure_dataframe_loc_indexer( + key: Any, frame: cudf.DataFrame +) -> Tuple[Any, Tuple[bool, ColumnAccessor]]: + """Destructure an index key for DataFrame loc getitem. + + Parameters + ---------- + key + Key to destructure + frame + DataFrame to provide context context + + Returns + ------- + tuple + 2-tuple of a key for the rows and tuple of + (column_index_is_scalar, column_names) for the columns + + Raises + ------ + TypeError + If the column indexer is invalid + IndexError + If the provided key does not destructure correctly + NotImplementedError + If the requested column indexer repeats columns + """ + rows, cols = destructure_loc_key(key, frame) + if cols is Ellipsis: + cols = slice(None) + try: + scalar = cols in frame._data + except TypeError: + scalar = False + try: + ca = frame._data.select_by_label(cols) + except TypeError: + raise TypeError( + "Column indices must be names, slices, " + "list-like of names, or boolean mask" + ) + if scalar: + assert ( + len(ca) == 1 + ), "Scalar column indexer should not produce more than one column" + + return rows, (scalar, ca) + + +def destructure_series_loc_indexer(key: Any, frame: cudf.Series) -> Any: + """Destructure an index key for Series loc getitem. + + Parameters + ---------- + key + Key to destructure + frame + Series for unpacking context + + Returns + ------- + Single key that will index the rows + """ + (rows,) = destructure_loc_key(key, frame) + return rows + + +def ordered_find(needles: "ColumnBase", haystack: "ColumnBase") -> GatherMap: + """Find locations of needles in a haystack preserving order + + Parameters + ---------- + needles + Labels to look for + haystack + Haystack to search in + + Returns + ------- + NumericalColumn + Integer gather map of locations needles were found in haystack + + Raises + ------ + KeyError + If not all needles were found in the haystack. + If needles cannot be converted to the dtype of haystack. + + Notes + ----- + This sorts the gather map so that the result comes back in the + order the needles were specified (and are found in the haystack). + """ + # Pre-process to match dtypes + needle_kind = needles.dtype.kind + haystack_kind = haystack.dtype.kind + if haystack_kind == "O": + try: + needles = needles.astype(haystack.dtype) + except ValueError: + # Pandas raise KeyError here + raise KeyError("Dtype mismatch in label lookup") + elif needle_kind == haystack_kind or { + haystack_kind, + needle_kind, + }.issubset({"i", "u", "f"}): + needles = needles.astype(haystack.dtype) + elif needles.dtype != haystack.dtype: + # Pandas raise KeyError here + raise KeyError("Dtype mismatch in label lookup") + # Can't always do an inner join because then we can't check if we + # had missing keys (can't check the length because the entries in + # the needle might appear multiple times in the haystack). + lgather, rgather = libcudf.join.join([needles], [haystack], how="left") + (left_order,) = libcudf.copying.gather( + [cudf.core.column.arange(len(needles), dtype=size_type_dtype)], + lgather, + nullify=False, + ) + (right_order,) = libcudf.copying.gather( + [cudf.core.column.arange(len(haystack), dtype=size_type_dtype)], + rgather, + nullify=True, + ) + if right_order.null_count > 0: + raise KeyError("Not all keys in index") + (rgather,) = libcudf.sort.sort_by_key( + [rgather], + [left_order, right_order], + [True, True], + ["last", "last"], + stable=True, + ) + return GatherMap.from_column_unchecked( + rgather, len(haystack), nullify=False + ) + + +def parse_single_row_loc_key( + key: Any, + index: cudf.BaseIndex, +) -> IndexingSpec: + """ + Turn a single label-based row indexer into structured information. + + This converts label-based lookups into structured positional + lookups. + + Valid values for the key are + - a slice (endpoints are looked up) + - a scalar label + - a boolean mask of the same length as the index + - a column of labels to look up (may be empty) + + Parameters + ---------- + key + Key for label-based row indexing + index + Index to act as haystack for labels + + Returns + ------- + IndexingSpec + Structured information for indexing + + Raises + ------ + KeyError + If any label is not found + ValueError + If labels cannot be coerced to index dtype + """ + n = len(index) + if isinstance(key, slice): + # Convert label slice to index slice + # TODO: datetime index must be handled specially (unless we go for + # pandas 2 compatibility) + parsed_key = index.find_label_range(key) + if len(range(n)[parsed_key]) == 0: + return EmptyIndexer() + else: + return SliceIndexer(parsed_key) + else: + is_scalar = _is_scalar_or_zero_d_array(key) + if is_scalar and isinstance(key, np.ndarray): + key = cudf.core.column.as_column(key.item(), dtype=key.dtype) + else: + key = cudf.core.column.as_column(key) + if ( + isinstance(key, cudf.core.column.CategoricalColumn) + and index.dtype != key.dtype + ): + # TODO: is this right? + key = key._get_decategorized_column() + if is_bool_dtype(key.dtype): + # The only easy one. + return MaskIndexer(BooleanMask(key, n)) + elif len(key) == 0: + return EmptyIndexer() + else: + # TODO: promote to Index objects, so this can handle + # categoricals correctly? + (haystack,) = index._columns + if isinstance(index, cudf.DatetimeIndex): + # Try to turn strings into datetimes + key = cudf.core.column.as_column(key, dtype=index.dtype) + gather_map = ordered_find(key, haystack) + if is_scalar and len(gather_map.column) == 1: + return ScalarIndexer(gather_map) + else: + return MapIndexer(gather_map) + + +def parse_row_loc_indexer(key: Any, index: cudf.BaseIndex) -> IndexingSpec: + """ + Normalize to return structured information for a label-based row indexer. + + Given a label-based row indexer that has already been destructured by + :func:`destructure_loc_key`, inspect further and produce structured + information for indexing operations to act upon. + + Parameters + ---------- + key + Suitably destructured key for row indexing + index + Index to provide context + + Returns + ------- + IndexingSpec + Structured data for indexing. A tag + parsed data. + + Raises + ------ + KeyError + If a valid type of indexer is provided, but not all keys are + found + TypeError + If the indexing key is otherwise invalid. + """ + # TODO: multiindices need to be treated separately + if key is Ellipsis: + # Ellipsis is handled here because multiindex level-based + # indices don't handle ellipsis in pandas. + return SliceIndexer(slice(None)) + else: + return parse_single_row_loc_key(key, index) From 4be3035100f83971a143334204a9a6813909cfca Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 12:43:10 +0100 Subject: [PATCH 06/16] Refactor to expose DataFrame._getitem_preprocessed --- python/cudf/cudf/core/dataframe.py | 99 +++++++++++++++++--------- python/cudf/cudf/core/indexed_frame.py | 2 +- 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6fcba4e9603..38e5c2742ab 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -36,7 +36,7 @@ from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing -from typing_extensions import assert_never +from typing_extensions import Self, assert_never import cudf import cudf.core.common @@ -426,39 +426,7 @@ def __getitem__(self, arg): row_spec = indexing_utils.parse_row_iloc_indexer( row_key, len(self._frame) ) - if col_is_scalar: - series = Series._from_data(ca, index=self._frame.index) - return series._getitem_preprocessed(row_spec) - if ca.names != self._frame._data.names: - frame = self._frame._from_data(ca, index=self._frame.index) - else: - frame = self._frame - if isinstance(row_spec, indexing_utils.MapIndexer): - return frame._gather(row_spec.key, keep_index=True) - elif isinstance(row_spec, indexing_utils.MaskIndexer): - return frame._apply_boolean_mask(row_spec.key, keep_index=True) - elif isinstance(row_spec, indexing_utils.SliceIndexer): - return frame._slice(row_spec.key) - elif isinstance(row_spec, indexing_utils.ScalarIndexer): - result = frame._gather(row_spec.key, keep_index=True) - # Attempt to turn into series. - try: - # Behaviour difference from pandas, which will merrily - # turn any heterogeneous set of columns into a series if - # you only ask for one row. - new_name = result.index[0] - result = Series._concat( - [result[name] for name in frame._data.names], - index=result.keys(), - ) - result.name = new_name - return result - except TypeError: - # Couldn't find a common type, just return a 1xN dataframe. - return result - elif isinstance(row_spec, indexing_utils.EmptyIndexer): - return frame._empty_like(keep_index=True) - assert_never(row_spec) + return self._frame._getitem_preprocessed(row_spec, col_is_scalar, ca) @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): @@ -957,7 +925,7 @@ def _from_data( data: MutableMapping, index: Optional[BaseIndex] = None, columns: Any = None, - ) -> DataFrame: + ) -> Self: out = super()._from_data(data=data, index=index) if columns is not None: out.columns = columns @@ -1116,6 +1084,67 @@ def __setattr__(self, key, col): else: super().__setattr__(key, col) + def _getitem_preprocessed( + self, + spec: indexing_utils.IndexingSpec, + col_is_scalar: bool, + ca: ColumnAccessor, + ) -> Union[Self, Series]: + """Get a subset of rows and columns given structured data + + Parameters + ---------- + spec + Indexing specification for the rows + col_is_scalar + Was the indexer of the columns a scalar (return a Series) + ca + ColumnAccessor representing the subsetted column data + + Returns + ------- + Subsetted DataFrame or Series (if a scalar row is requested + and the concatenation of the column types is possible) + + Notes + ----- + This function performs no bounds-checking or massaging of the + inputs. + """ + if col_is_scalar: + series = Series._from_data(ca, index=self.index) + return series._getitem_preprocessed(spec) + if ca.names != self._data.names: + frame = self._from_data(ca, index=self.index) + else: + frame = self + if isinstance(spec, indexing_utils.MapIndexer): + return frame._gather(spec.key, keep_index=True) + elif isinstance(spec, indexing_utils.MaskIndexer): + return frame._apply_boolean_mask(spec.key, keep_index=True) + elif isinstance(spec, indexing_utils.SliceIndexer): + return frame._slice(spec.key) + elif isinstance(spec, indexing_utils.ScalarIndexer): + result = frame._gather(spec.key, keep_index=True) + # Attempt to turn into series. + try: + # Behaviour difference from pandas, which will merrily + # turn any heterogeneous set of columns into a series if + # you only ask for one row. + new_name = result.index[0] + result = Series._concat( + [result[name] for name in frame._data.names], + index=result.keys(), + ) + result.name = new_name + return result + except TypeError: + # Couldn't find a common type, just return a 1xN dataframe. + return result + elif isinstance(spec, indexing_utils.EmptyIndexer): + return frame._empty_like(keep_index=True) + assert_never(spec) + @_cudf_nvtx_annotate def __getitem__(self, arg): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6eb3a50e9..657ed2defec 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1812,7 +1812,7 @@ def _gather( self, gather_map: GatherMap, keep_index=True, - ): + ) -> Self: """Gather rows of frame specified by indices in `gather_map`. Maintain the index if keep_index is True. From 9880b3c2bae0cc5f2541efef00f7077d9e6d094b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 13:17:19 +0100 Subject: [PATCH 07/16] Use structured data for label indexing for non-MultiIndex case --- python/cudf/cudf/core/dataframe.py | 101 +++++++---------------------- python/cudf/cudf/core/series.py | 10 +++ 2 files changed, 32 insertions(+), 79 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 38e5c2742ab..a5a6fba02a4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -122,22 +122,6 @@ def _shape_mismatch_error(x, y): class _DataFrameIndexer(_FrameIndexer): - def __getitem__(self, arg): - if ( - isinstance(self._frame.index, MultiIndex) - or self._frame._data.multiindex - ): - # This try/except block allows the use of pandas-like - # tuple arguments into MultiIndex dataframes. - try: - return self._getitem_tuple_arg(arg) - except (TypeError, KeyError, IndexError, ValueError): - return self._getitem_tuple_arg((arg, slice(None))) - else: - if not isinstance(arg, tuple): - arg = (arg, slice(None)) - return self._getitem_tuple_arg(arg) - def __setitem__(self, key, value): if not isinstance(key, tuple): key = (key, slice(None)) @@ -229,14 +213,30 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ - @_cudf_nvtx_annotate - def _getitem_scalar(self, arg): - return self._frame[arg[1]].loc[arg[0]] + def __getitem__(self, arg): + if isinstance(self._frame.index, cudf.MultiIndex): + # This try/except block allows the use of pandas-like + # tuple arguments into MultiIndex dataframes. + try: + return self._getitem_tuple_arg(arg) + except (TypeError, KeyError, IndexError, ValueError): + return self._getitem_tuple_arg((arg, slice(None))) + else: + row_key, ( + col_is_scalar, + ca, + ) = indexing_utils.destructure_dataframe_loc_indexer( + arg, self._frame + ) + row_spec = indexing_utils.parse_row_loc_indexer( + row_key, self._frame.index + ) + return self._frame._getitem_preprocessed( + row_spec, col_is_scalar, ca + ) @_cudf_nvtx_annotate def _getitem_tuple_arg(self, arg): - from uuid import uuid4 - # Step 1: Gather columns if isinstance(arg, tuple): columns_df = self._frame._get_columns_by_label(arg[1]) @@ -262,64 +262,7 @@ def _getitem_tuple_arg(self, arg): row_arg = arg return columns_df.index._get_row_major(columns_df, row_arg) else: - if isinstance(arg[0], slice): - out = _get_label_range_or_mask( - columns_df.index, arg[0].start, arg[0].stop, arg[0].step - ) - if isinstance(out, slice): - df = columns_df._slice(out) - else: - df = columns_df._apply_boolean_mask( - BooleanMask.from_column_unchecked( - cudf.core.column.as_column(out) - ) - ) - else: - tmp_arg = arg - if is_scalar(arg[0]): - # If a scalar, there is possibility of having duplicates. - # Join would get all the duplicates. So, converting it to - # an array kind. - tmp_arg = ([tmp_arg[0]], tmp_arg[1]) - if len(tmp_arg[0]) == 0: - return columns_df._empty_like(keep_index=True) - tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1]) - - if is_bool_dtype(tmp_arg[0]): - df = columns_df._apply_boolean_mask( - BooleanMask(tmp_arg[0], len(columns_df)) - ) - else: - tmp_col_name = str(uuid4()) - cantor_name = "_" + "_".join( - map(str, columns_df._data.names) - ) - if columns_df._data.multiindex: - # column names must be appropriate length tuples - extra = tuple( - "" for _ in range(columns_df._data.nlevels - 1) - ) - tmp_col_name = (tmp_col_name, *extra) - cantor_name = (cantor_name, *extra) - other_df = DataFrame( - {tmp_col_name: column.arange(len(tmp_arg[0]))}, - index=as_index(tmp_arg[0]), - ) - columns_df[cantor_name] = column.arange(len(columns_df)) - df = other_df.join(columns_df, how="inner") - # as join is not assigning any names to index, - # update it over here - df.index.name = columns_df.index.name - df = df.sort_values(by=[tmp_col_name, cantor_name]) - df.drop(columns=[tmp_col_name, cantor_name], inplace=True) - # There were no indices found - if len(df) == 0: - raise KeyError(arg) - - # Step 3: Downcast - if self._can_downcast_to_series(df, arg): - return self._downcast_to_series(df, arg) - return df + raise RuntimeError("Should have been handled by now") @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5c3d320df8e..8c584b2f17e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -237,8 +237,18 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ + _frame: Series + @_cudf_nvtx_annotate def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + if not isinstance(self._frame.index, cudf.MultiIndex): + indexing_spec = indexing_utils.parse_row_loc_indexer( + indexing_utils.destructure_series_loc_indexer( + arg, self._frame + ), + self._frame.index, + ) + return self._frame._getitem_preprocessed(indexing_spec) if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) From 199ffdfe634f0460ea4ebeb0d64d58d1f73b84f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 16:50:34 +0100 Subject: [PATCH 08/16] Move get_label_range_or_mask to indexing_utils Trying to move all the special-case handling into one place. Additionally, handle step != 1 correctly for datetime indexing. --- python/cudf/cudf/core/dataframe.py | 13 ++++--- python/cudf/cudf/core/indexed_frame.py | 23 ------------ python/cudf/cudf/core/indexing_utils.py | 48 ++++++++++++++++++++----- python/cudf/cudf/core/series.py | 11 ++++-- 4 files changed, 57 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a5a6fba02a4..09386f88f3d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -75,7 +75,6 @@ from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, - _get_label_range_or_mask, _indices_from_labels, doc_reset_index_template, ) @@ -279,10 +278,16 @@ def _setitem_tuple_arg(self, key, value): columns_df = self._frame._get_columns_by_label(key[1]) except KeyError: if not self._frame.empty and isinstance(key[0], slice): - pos_range = _get_label_range_or_mask( - self._frame.index, key[0].start, key[0].stop, key[0].step + indexer = indexing_utils.find_label_range_or_mask( + key[0], self._frame.index ) - idx = self._frame.index[pos_range] + index = self._frame.index + if isinstance(indexer, indexing_utils.EmptyIndexer): + idx = index[0:0:1] + elif isinstance(indexer, indexing_utils.SliceIndexer): + idx = index[indexer.key] + else: + idx = index[indexer.key.column] elif self._frame.empty and isinstance(key[0], slice): idx = None else: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 657ed2defec..e120282948a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -193,29 +193,6 @@ def _indices_from_labels(obj, labels): return lhs.join(rhs).sort_values(by=["__", "_"])["_"] -def _get_label_range_or_mask(index, start, stop, step): - if ( - not (start is None and stop is None) - and type(index) is cudf.core.index.DatetimeIndex - and index.is_monotonic_increasing is False - ): - start = pd.to_datetime(start) - stop = pd.to_datetime(stop) - if start is not None and stop is not None: - if start > stop: - return slice(0, 0, None) - # TODO: Once Index binary ops are updated to support logical_and, - # can use that instead of using cupy. - boolean_mask = cp.logical_and((index >= start), (index <= stop)) - elif start is not None: - boolean_mask = index >= start - else: - boolean_mask = index <= stop - return boolean_mask - else: - return index.find_label_range(slice(start, stop, step)) - - class _FrameIndexer: """Parent class for indexers.""" diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index e089040bbaf..87cb8270593 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,12 @@ from __future__ import annotations from dataclasses import dataclass +from functools import partial, reduce from typing import TYPE_CHECKING, Any, Tuple, Union +import cupy as cp import numpy as np +import pandas as pd from typing_extensions import TypeAlias import cudf @@ -462,6 +465,42 @@ def ordered_find(needles: "ColumnBase", haystack: "ColumnBase") -> GatherMap: ) +def find_label_range_or_mask( + key: slice, index: cudf.BaseIndex +) -> EmptyIndexer | MapIndexer | MaskIndexer | SliceIndexer: + # TODO: datetime index must only be handled specially until pandas 2 + if ( + not (key.start is None and key.stop is None) + and isinstance(index, cudf.core.index.DatetimeIndex) + and not index.is_monotonic_increasing + ): + start = pd.to_datetime(key.start) + stop = pd.to_datetime(key.stop) + mask = [] + if start is not None: + mask.append(index >= start) + if stop is not None: + mask.append(index <= stop) + bool_mask = reduce(partial(cp.logical_and, out=mask[0]), mask) + if key.step is None or key.step == 1: + return MaskIndexer(BooleanMask(bool_mask, len(index))) + else: + (map_,) = bool_mask.nonzero() + return MapIndexer( + GatherMap.from_column_unchecked( + cudf.core.column.as_column(map_[:: key.step]), + len(index), + nullify=False, + ) + ) + else: + parsed_key = index.find_label_range(key) + if len(range(len(index))[parsed_key]) == 0: + return EmptyIndexer() + else: + return SliceIndexer(parsed_key) + + def parse_single_row_loc_key( key: Any, index: cudf.BaseIndex, @@ -499,14 +538,7 @@ def parse_single_row_loc_key( """ n = len(index) if isinstance(key, slice): - # Convert label slice to index slice - # TODO: datetime index must be handled specially (unless we go for - # pandas 2 compatibility) - parsed_key = index.find_label_range(key) - if len(range(n)[parsed_key]) == 0: - return EmptyIndexer() - else: - return SliceIndexer(parsed_key) + return find_label_range_or_mask(key, index) else: is_scalar = _is_scalar_or_zero_d_array(key) if is_scalar and isinstance(key, np.ndarray): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c584b2f17e..1c5b9173a97 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -66,7 +66,6 @@ from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, - _get_label_range_or_mask, _indices_from_labels, doc_reset_index_template, ) @@ -322,9 +321,15 @@ def _loc_to_iloc(self, arg): raise KeyError("Label scalar is out of bounds") elif isinstance(arg, slice): - return _get_label_range_or_mask( - self._frame.index, arg.start, arg.stop, arg.step + indexer = indexing_utils.find_label_range_or_mask( + arg, self._frame.index ) + if isinstance(indexer, indexing_utils.EmptyIndexer): + return slice(0, 0, 1) + elif isinstance(indexer, indexing_utils.SliceIndexer): + return indexer.key + else: + return indexer.key.column elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)): if isinstance(arg, pd.MultiIndex): arg = cudf.MultiIndex.from_pandas(arg) From 00de6c4536c9c52fa23b03fab6c0dca0aa8f6000 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 16:51:26 +0100 Subject: [PATCH 09/16] Cast labels to datetime in DatetimeIndex.get_slice_bound --- python/cudf/cudf/core/index.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c7e25cdc430..ad1b9e7d937 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2129,6 +2129,11 @@ def searchsorted( value, side=side, ascending=ascending, na_position=na_position ) + def get_slice_bound(self, label, side: str, kind=None) -> int: + if isinstance(label, str): + label = pd.to_datetime(label) + return super().get_slice_bound(label, side, kind=kind) + @property # type: ignore @_cudf_nvtx_annotate def year(self): From da093f4b33900402e007393ffa8f0c9a9b67e17d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 17:09:49 +0100 Subject: [PATCH 10/16] Remove some xfails --- python/cudf/cudf/tests/test_indexing.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 2e169a2b0b1..05e14107f12 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1824,7 +1824,6 @@ def test_loc_repeated_index_label_issue_8693(): assert_eq(expect, actual) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13268") @pytest.mark.parametrize( "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"] ) @@ -1873,6 +1872,17 @@ def test_iloc_integer_categorical_issue_13013(indexer): assert_eq(expect, actual) +@pytest.mark.parametrize("indexer", [[1], [0, 2]]) +def test_loc_integer_categorical_issue_13014(indexer): + # https://github.com/rapidsai/cudf/issues/13014 + s = pd.Series([0, 1, 2]) + index = pd.Categorical(indexer) + expect = s.loc[index] + c = cudf.from_pandas(s) + actual = c.loc[index] + assert_eq(expect, actual) + + def test_iloc_incorrect_boolean_mask_length_issue_13015(): # https://github.com/rapidsai/cudf/issues/13015 s = pd.Series([0, 1, 2]) @@ -1959,7 +1969,6 @@ def test_loc_unsorted_index_slice_lookup_keyerror_issue_12833(): cdf.loc[1:5] -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13379") @pytest.mark.parametrize("index", [range(5), list(range(5))]) def test_loc_missing_label_keyerror_issue_13379(index): # https://github.com/rapidsai/cudf/issues/13379 @@ -1973,6 +1982,16 @@ def test_loc_missing_label_keyerror_issue_13379(index): cdf.loc[[0, 5]] +def test_loc_categorical_no_integer_fallback_issue_13653(): + # https://github.com/rapidsai/cudf/issues/13653 + s = cudf.Series( + [1, 2], index=cudf.CategoricalIndex([3, 4], categories=[3, 4]) + ) + actual = s.loc[3] + expect = s.to_pandas().loc[3] + assert_eq(actual, expect) + + @pytest.mark.parametrize("series", [True, False], ids=["Series", "DataFrame"]) def test_loc_repeated_label_ordering_issue_13658(series): # https://github.com/rapidsai/cudf/issues/13658 From 9cbd2be87f3e826907045bd9ceeb7969e48740e8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 17 Jul 2023 17:36:11 +0100 Subject: [PATCH 11/16] Docstring --- python/cudf/cudf/core/indexing_utils.py | 32 ++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 87cb8270593..52a764d1e10 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -468,12 +468,42 @@ def ordered_find(needles: "ColumnBase", haystack: "ColumnBase") -> GatherMap: def find_label_range_or_mask( key: slice, index: cudf.BaseIndex ) -> EmptyIndexer | MapIndexer | MaskIndexer | SliceIndexer: - # TODO: datetime index must only be handled specially until pandas 2 + """ + Convert a slice of labels into a slice of positions + + Parameters + ---------- + key + Slice to convert + index + Index to look up in + + Returns + ------- + IndexingSpec + Structured data for indexing (but never a :class:`ScalarIndexer`) + + Raises + ------ + KeyError + If the index is unsorted and not a DatetimeIndex + + Notes + ----- + Until Pandas 2, looking up slices in an unsorted DatetimeIndex + constructs a mask by checking which dates fall in the range. + + From Pandas 2, slice lookup in DatetimeIndexes will behave + identically to other index types and fail with a KeyError for + an unsorted index if either of the slice endpoints are not unique + in the index or are not in the index at all. + """ if ( not (key.start is None and key.stop is None) and isinstance(index, cudf.core.index.DatetimeIndex) and not index.is_monotonic_increasing ): + # TODO: datetime index must only be handled specially until pandas 2 start = pd.to_datetime(key.start) stop = pd.to_datetime(key.stop) mask = [] From da385ccece193218ac66fd2980de6c066081450d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jul 2023 11:04:52 +0100 Subject: [PATCH 12/16] Share some logic between loc and iloc paths Perhaps not worth it... --- python/cudf/cudf/core/indexing_utils.py | 134 ++++++++++++------------ 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 52a764d1e10..bd5b7bba962 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import partial, reduce -from typing import TYPE_CHECKING, Any, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Tuple, Union import cupy as cp import numpy as np @@ -67,6 +67,52 @@ class ScalarIndexer: ] +# Helpers for code-sharing between loc and iloc paths +def expand_key(key, frame): + """Slice-expand key into a tuple of length frame.dim + + Also apply callables on each piece. + """ + dim = len(frame.shape) + if isinstance(key, tuple): + # Key potentially indexes rows and columns, slice-expand to + # shape of frame + indexers = key + (slice(None),) * (dim - len(key)) + if len(indexers) > dim: + raise IndexError( + f"Too many indexers: got {len(indexers)} expected {dim}" + ) + else: + # Key indexes rows, slice-expand to shape of frame + indexers = (key, *(slice(None),) * (dim - 1)) + return tuple(k(frame) if callable(k) else k for k in indexers) + + +def destructure_dataframe_indexer( + key: Any, + frame: cudf.DataFrame, + destructure: Callable[[Any, cudf.DataFrame], tuple[Any, ...]], + is_scalar: Callable[[Any, ColumnAccessor], bool], + get_ca: str, +): + rows, cols = destructure(key, frame) + if cols is Ellipsis: + cols = slice(None) + try: + ca = getattr(frame._data, get_ca)(cols) + except TypeError as e: + raise TypeError( + "Column indices must be names, slices, " + "list-like of names, or boolean mask" + ) from e + scalar = is_scalar(cols, ca) + if scalar: + assert ( + len(ca) == 1 + ), "Scalar column indexer should not produce more than one column" + return rows, (scalar, ca) + + def destructure_iloc_key( key: Any, frame: Union[cudf.Series, cudf.DataFrame] ) -> tuple[Any, ...]: @@ -111,19 +157,7 @@ def destructure_iloc_key( IndexError If there are too many indexers, or any individual indexer is a tuple. """ - n = len(frame.shape) - if isinstance(key, tuple): - # Key potentially indexes rows and columns, slice-expand to - # shape of frame - indexers = key + (slice(None),) * (n - len(key)) - if len(indexers) > n: - raise IndexError( - f"Too many indexers: got {len(indexers)} expected {n}" - ) - else: - # Key indexes rows, slice-expand to shape of frame - indexers = (key, *(slice(None),) * (n - 1)) - indexers = tuple(k(frame) if callable(k) else k for k in indexers) + indexers = expand_key(key, frame) if any(isinstance(k, tuple) for k in indexers): raise IndexError( "Too many indexers: can't have nested tuples in iloc indexing" @@ -147,7 +181,7 @@ def destructure_dataframe_iloc_indexer( ------- tuple 2-tuple of a key for the rows and tuple of - (column_index_is_scalar, column_names) for the columns + (column_index_is_scalar, ColumnAccessor) for the columns Raises ------ @@ -158,23 +192,13 @@ def destructure_dataframe_iloc_indexer( NotImplementedError If the requested column indexer repeats columns """ - rows, cols = destructure_iloc_key(key, frame) - if cols is Ellipsis: - cols = slice(None) - scalar = is_integer(cols) - try: - ca = frame._data.select_by_index(cols) - except TypeError: - raise TypeError( - "Column indices must be integers, slices, " - "or list-like of integers" - ) - if scalar: - assert ( - len(ca) == 1 - ), "Scalar column indexer should not produce more than one column" - - return rows, (scalar, ca) + return destructure_dataframe_indexer( + key, + frame, + destructure_iloc_key, + lambda col, _ca: is_integer(col), + "select_by_index", + ) def destructure_series_iloc_indexer(key: Any, frame: cudf.Series) -> Any: @@ -292,10 +316,9 @@ def destructure_loc_key( IndexError If there are too many indexers. """ - n = len(frame.shape) if ( isinstance(frame.index, cudf.MultiIndex) - and n == 2 + and len(frame.shape) == 2 and isinstance(key, tuple) and all(map(is_scalar, key)) ): @@ -313,18 +336,7 @@ def destructure_loc_key( else: # key just indexes rows key = (key,) - if isinstance(key, tuple): - # Key potentially indexes rows and columns, slice-expand to - # shape of frame - indexers = key + (slice(None),) * (n - len(key)) - if len(indexers) > n: - raise IndexError( - f"Too many indexers: got {len(indexers)} expected {n}" - ) - else: - # Key indexes rows, slice-expand to shape of frame - indexers = (key, *(slice(None),) * (n - 1)) - return tuple(k(frame) if callable(k) else k for k in indexers) + return expand_key(key, frame) def destructure_dataframe_loc_indexer( @@ -343,7 +355,7 @@ def destructure_dataframe_loc_indexer( ------- tuple 2-tuple of a key for the rows and tuple of - (column_index_is_scalar, column_names) for the columns + (column_index_is_scalar, ColumnAccessor) for the columns Raises ------ @@ -354,26 +366,16 @@ def destructure_dataframe_loc_indexer( NotImplementedError If the requested column indexer repeats columns """ - rows, cols = destructure_loc_key(key, frame) - if cols is Ellipsis: - cols = slice(None) - try: - scalar = cols in frame._data - except TypeError: - scalar = False - try: - ca = frame._data.select_by_label(cols) - except TypeError: - raise TypeError( - "Column indices must be names, slices, " - "list-like of names, or boolean mask" - ) - if scalar: - assert ( - len(ca) == 1 - ), "Scalar column indexer should not produce more than one column" - return rows, (scalar, ca) + def is_scalar(name, ca): + try: + return name in ca + except TypeError: + return False + + return destructure_dataframe_indexer( + key, frame, destructure_loc_key, is_scalar, "select_by_label" + ) def destructure_series_loc_indexer(key: Any, frame: cudf.Series) -> Any: From ee00399beec2c4fcac4902269b4f733db36168c3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jul 2023 15:44:53 +0100 Subject: [PATCH 13/16] Explicitly raise for MultiIndex --- python/cudf/cudf/core/indexing_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index bd5b7bba962..4d31afb1731 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -630,6 +630,10 @@ def parse_row_loc_indexer(key: Any, index: cudf.BaseIndex) -> IndexingSpec: TypeError If the indexing key is otherwise invalid. """ + if isinstance(index, cudf.MultiIndex): + raise NotImplementedError( + "This code path is not designed for multiindices" + ) # TODO: multiindices need to be treated separately if key is Ellipsis: # Ellipsis is handled here because multiindex level-based From 87d6998b3cfc8aa41b701e554ca3a3de52b99936 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jul 2023 15:45:25 +0100 Subject: [PATCH 14/16] Move type annotation to DataframeIndexer --- python/cudf/cudf/core/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 09386f88f3d..1486f870789 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -121,6 +121,8 @@ def _shape_mismatch_error(x, y): class _DataFrameIndexer(_FrameIndexer): + _frame: DataFrame + def __setitem__(self, key, value): if not isinstance(key, tuple): key = (key, slice(None)) @@ -364,8 +366,6 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): For selection by index. """ - _frame: DataFrame - def __getitem__(self, arg): row_key, ( col_is_scalar, From b8fd56e1ac5b4c911b2d07e76efec8e934172238 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 18 Jul 2023 16:17:50 +0100 Subject: [PATCH 15/16] Remove code handling multiindex keys for now --- python/cudf/cudf/core/indexing_utils.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 4d31afb1731..e19607a1ba2 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -19,7 +19,6 @@ is_bool_dtype, is_integer, is_integer_dtype, - is_scalar, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap @@ -316,26 +315,6 @@ def destructure_loc_key( IndexError If there are too many indexers. """ - if ( - isinstance(frame.index, cudf.MultiIndex) - and len(frame.shape) == 2 - and isinstance(key, tuple) - and all(map(is_scalar, key)) - ): - # This is "best-effort" and ambiguous - if len(key) == 2: - if key[1] in frame.index._columns[1]: - # key just indexes the rows - key = (key,) - elif key[1] in frame._data: - # key indexes rows and columns - key = key - else: - # key indexes rows and we will raise a keyerror - key = (key,) - else: - # key just indexes rows - key = (key,) return expand_key(key, frame) From 546fef54d4bc4873767a15defa889a1ee86e1780 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 20 Jul 2023 12:12:36 +0100 Subject: [PATCH 16/16] Add test of #13652 --- python/cudf/cudf/tests/test_indexing.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 05e14107f12..8a81e617d77 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1982,6 +1982,28 @@ def test_loc_missing_label_keyerror_issue_13379(index): cdf.loc[[0, 5]] +@pytest.mark.parametrize("index_is_ordered", [False, True]) +@pytest.mark.parametrize("label_is_ordered", [False, True]) +def test_loc_categorical_ordering_mismatch_issue_13652( + index_is_ordered, label_is_ordered +): + # https://github.com/rapidsai/cudf/issues/13652 + s = cudf.Series( + [0, 2, 8, 4, 2], + index=cudf.CategoricalIndex( + [1, 2, 3, 4, 5], + categories=[1, 2, 3, 4, 5], + ordered=index_is_ordered, + ), + ) + labels = cudf.CategoricalIndex( + [1, 4], categories=[1, 4], ordered=label_is_ordered + ) + actual = s.loc[labels] + expect = s.to_pandas().loc[labels.to_pandas()] + assert_eq(actual, expect) + + def test_loc_categorical_no_integer_fallback_issue_13653(): # https://github.com/rapidsai/cudf/issues/13653 s = cudf.Series(