Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse (non-MultiIndex) label-based keys to structured data #13717

Open
wants to merge 16 commits into
base: branch-23.10
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import pandas as pd
from packaging.version import Version
from pandas.api.types import is_bool
from typing_extensions import Self

import cudf
from cudf.core import column
Expand Down Expand Up @@ -404,6 +403,10 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
ColumnAccessor
"""
keys = self.get_labels_by_index(index)
if len(set(keys)) != len(keys):
raise NotImplementedError(
"cudf DataFrames do not support repeated column names"
)
Comment on lines +406 to +409
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now loudly raises when getting from a ColumnAccessor would produce duplicate column names

data = {k: self._data[k] for k in keys}
return self.__class__(
data,
Expand Down Expand Up @@ -477,13 +480,6 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
self._data[key] = value
self._clear_cache()

def _select_by_names(self, names: abc.Sequence) -> Self:
    """Return a new accessor holding ``names``, in the given order.

    The multiindex flag and level names of this accessor are carried
    over unchanged; column objects are shared, not copied.
    """
    return self.__class__(
        {key: self[key] for key in names},
        multiindex=self.multiindex,
        level_names=self.level_names,
    )

Comment on lines -480 to -486
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I introduced this in #13534, but I realised it's better if the column key parsing returns a ColumnAccessor (rather than column names) so it's no longer necessary.

def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
# Might be a generator
key = tuple(key)
Expand All @@ -499,6 +495,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
if keep
)
else:
if len(set(key)) != len(key):
raise NotImplementedError(
"cudf DataFrames do not support repeated column names"
)
data = {k: self._grouped_data[k] for k in key}
if self.multiindex:
data = _to_flat_dict(data)
Expand Down
223 changes: 97 additions & 126 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing
from typing_extensions import assert_never
from typing_extensions import Self, assert_never

import cudf
import cudf.core.common
Expand Down Expand Up @@ -75,7 +75,6 @@
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
_get_label_range_or_mask,
_indices_from_labels,
doc_reset_index_template,
)
Expand Down Expand Up @@ -122,21 +121,7 @@ def _shape_mismatch_error(x, y):


class _DataFrameIndexer(_FrameIndexer):
def __getitem__(self, arg):
    # Shared loc/iloc dispatch: frames with a MultiIndex on either axis
    # retry a bare key as ``(key, slice(None))``; all other frames
    # normalise the key to a 2-tuple up front.
    if (
        isinstance(self._frame.index, MultiIndex)
        or self._frame._data.multiindex
    ):
        # This try/except block allows the use of pandas-like
        # tuple arguments into MultiIndex dataframes.
        try:
            return self._getitem_tuple_arg(arg)
        except (TypeError, KeyError, IndexError, ValueError):
            return self._getitem_tuple_arg((arg, slice(None)))
    else:
        if not isinstance(arg, tuple):
            arg = (arg, slice(None))
        return self._getitem_tuple_arg(arg)
Comment on lines -125 to -139
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No longer shared between loc and iloc cases.

_frame: DataFrame

def __setitem__(self, key, value):
if not isinstance(key, tuple):
Expand Down Expand Up @@ -229,14 +214,30 @@ class _DataFrameLocIndexer(_DataFrameIndexer):
For selection by label.
"""

@_cudf_nvtx_annotate
def _getitem_scalar(self, arg):
return self._frame[arg[1]].loc[arg[0]]
def __getitem__(self, arg):
    """Label-based row/column selection (``df.loc[...]``).

    MultiIndex frames keep the legacy tuple-retry path; all other
    frames destructure the key into structured row/column indexing
    data and delegate to ``DataFrame._getitem_preprocessed``.
    """
    if isinstance(self._frame.index, cudf.MultiIndex):
        # This try/except block allows the use of pandas-like
        # tuple arguments into MultiIndex dataframes.
        try:
            return self._getitem_tuple_arg(arg)
        except (TypeError, KeyError, IndexError, ValueError):
            return self._getitem_tuple_arg((arg, slice(None)))
    else:
        # Split the key into a row part and a (scalar?, ColumnAccessor)
        # description of the selected columns.
        row_key, (
            col_is_scalar,
            ca,
        ) = indexing_utils.destructure_dataframe_loc_indexer(
            arg, self._frame
        )
        # Turn the label-based row key into a positional IndexingSpec.
        row_spec = indexing_utils.parse_row_loc_indexer(
            row_key, self._frame.index
        )
        return self._frame._getitem_preprocessed(
            row_spec, col_is_scalar, ca
        )
Comment on lines +226 to +237
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the new approach (which doesn't handle multiindex lookups yet).


@_cudf_nvtx_annotate
def _getitem_tuple_arg(self, arg):
from uuid import uuid4

# Step 1: Gather columns
if isinstance(arg, tuple):
columns_df = self._frame._get_columns_by_label(arg[1])
Expand All @@ -262,64 +263,7 @@ def _getitem_tuple_arg(self, arg):
row_arg = arg
return columns_df.index._get_row_major(columns_df, row_arg)
else:
if isinstance(arg[0], slice):
out = _get_label_range_or_mask(
columns_df.index, arg[0].start, arg[0].stop, arg[0].step
)
if isinstance(out, slice):
df = columns_df._slice(out)
else:
df = columns_df._apply_boolean_mask(
BooleanMask.from_column_unchecked(
cudf.core.column.as_column(out)
)
)
else:
tmp_arg = arg
if is_scalar(arg[0]):
# If a scalar, there is possibility of having duplicates.
# Join would get all the duplicates. So, converting it to
# an array kind.
tmp_arg = ([tmp_arg[0]], tmp_arg[1])
if len(tmp_arg[0]) == 0:
return columns_df._empty_like(keep_index=True)
tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1])

if is_bool_dtype(tmp_arg[0]):
df = columns_df._apply_boolean_mask(
BooleanMask(tmp_arg[0], len(columns_df))
)
else:
tmp_col_name = str(uuid4())
cantor_name = "_" + "_".join(
map(str, columns_df._data.names)
)
if columns_df._data.multiindex:
# column names must be appropriate length tuples
extra = tuple(
"" for _ in range(columns_df._data.nlevels - 1)
)
tmp_col_name = (tmp_col_name, *extra)
cantor_name = (cantor_name, *extra)
other_df = DataFrame(
{tmp_col_name: column.arange(len(tmp_arg[0]))},
index=as_index(tmp_arg[0]),
)
columns_df[cantor_name] = column.arange(len(columns_df))
df = other_df.join(columns_df, how="inner")
# as join is not assigning any names to index,
# update it over here
df.index.name = columns_df.index.name
df = df.sort_values(by=[tmp_col_name, cantor_name])
df.drop(columns=[tmp_col_name, cantor_name], inplace=True)
# There were no indices found
if len(df) == 0:
raise KeyError(arg)

# Step 3: Downcast
if self._can_downcast_to_series(df, arg):
return self._downcast_to_series(df, arg)
return df
raise RuntimeError("Should have been handled by now")

@_cudf_nvtx_annotate
def _setitem_tuple_arg(self, key, value):
Expand All @@ -336,10 +280,16 @@ def _setitem_tuple_arg(self, key, value):
columns_df = self._frame._get_columns_by_label(key[1])
except KeyError:
if not self._frame.empty and isinstance(key[0], slice):
pos_range = _get_label_range_or_mask(
self._frame.index, key[0].start, key[0].stop, key[0].step
indexer = indexing_utils.find_label_range_or_mask(
key[0], self._frame.index
)
idx = self._frame.index[pos_range]
index = self._frame.index
if isinstance(indexer, indexing_utils.EmptyIndexer):
idx = index[0:0:1]
elif isinstance(indexer, indexing_utils.SliceIndexer):
idx = index[indexer.key]
else:
idx = index[indexer.key.column]
Comment on lines -339 to +292
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've moved _get_label_range_or_mask into indexing_utils and return structured data (which for now we must pull apart here because I haven't touched setitem yet).

elif self._frame.empty and isinstance(key[0], slice):
idx = None
else:
Expand Down Expand Up @@ -416,55 +366,15 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
For selection by index.
"""

_frame: DataFrame

Comment on lines -419 to -420
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This moved to the parent class.

def __getitem__(self, arg):
row_key, (
col_is_scalar,
column_names,
ca,
) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame)
row_spec = indexing_utils.parse_row_iloc_indexer(
row_key, len(self._frame)
)
ca = self._frame._data
index = self._frame.index
if col_is_scalar:
s = Series._from_data(
ca._select_by_names(column_names), index=index
)
return s._getitem_preprocessed(row_spec)
if column_names != list(self._frame._column_names):
frame = self._frame._from_data(
ca._select_by_names(column_names), index=index
)
else:
frame = self._frame
if isinstance(row_spec, indexing_utils.MapIndexer):
return frame._gather(row_spec.key, keep_index=True)
elif isinstance(row_spec, indexing_utils.MaskIndexer):
return frame._apply_boolean_mask(row_spec.key, keep_index=True)
elif isinstance(row_spec, indexing_utils.SliceIndexer):
return frame._slice(row_spec.key)
elif isinstance(row_spec, indexing_utils.ScalarIndexer):
result = frame._gather(row_spec.key, keep_index=True)
# Attempt to turn into series.
try:
# Behaviour difference from pandas, which will merrily
# turn any heterogeneous set of columns into a series if
# you only ask for one row.
new_name = result.index[0]
result = Series._concat(
[result[name] for name in column_names],
index=result.keys(),
)
result.name = new_name
return result
except TypeError:
# Couldn't find a common type, just return a 1xN dataframe.
return result
elif isinstance(row_spec, indexing_utils.EmptyIndexer):
return frame._empty_like(keep_index=True)
assert_never(row_spec)
return self._frame._getitem_preprocessed(row_spec, col_is_scalar, ca)
Comment on lines -429 to +377
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Promotes the handling of structured data to the dataframe object (so this is code movement).


@_cudf_nvtx_annotate
def _setitem_tuple_arg(self, key, value):
Expand Down Expand Up @@ -963,7 +873,7 @@ def _from_data(
data: MutableMapping,
index: Optional[BaseIndex] = None,
columns: Any = None,
) -> DataFrame:
) -> Self:
out = super()._from_data(data=data, index=index)
if columns is not None:
out.columns = columns
Expand Down Expand Up @@ -1122,6 +1032,67 @@ def __setattr__(self, key, col):
else:
super().__setattr__(key, col)

def _getitem_preprocessed(
    self,
    spec: indexing_utils.IndexingSpec,
    col_is_scalar: bool,
    ca: ColumnAccessor,
) -> Union[Self, Series]:
    """Get a subset of rows and columns given structured data

    Parameters
    ----------
    spec
        Indexing specification for the rows
    col_is_scalar
        Was the indexer of the columns a scalar (return a Series)
    ca
        ColumnAccessor representing the subsetted column data

    Returns
    -------
    Subsetted DataFrame or Series (if a scalar row is requested
    and the concatenation of the column types is possible)

    Notes
    -----
    This function performs no bounds-checking or massaging of the
    inputs.
    """
    if col_is_scalar:
        # A scalar column key yields a Series; row selection is
        # delegated to the Series' own preprocessed getitem.
        series = Series._from_data(ca, index=self.index)
        return series._getitem_preprocessed(spec)
    if ca.names != self._data.names:
        # Only construct a new frame when the column subset differs;
        # otherwise reuse self and avoid rebuilding.
        frame = self._from_data(ca, index=self.index)
    else:
        frame = self
    # Dispatch on the structured row-indexing spec.
    if isinstance(spec, indexing_utils.MapIndexer):
        return frame._gather(spec.key, keep_index=True)
    elif isinstance(spec, indexing_utils.MaskIndexer):
        return frame._apply_boolean_mask(spec.key, keep_index=True)
    elif isinstance(spec, indexing_utils.SliceIndexer):
        return frame._slice(spec.key)
    elif isinstance(spec, indexing_utils.ScalarIndexer):
        result = frame._gather(spec.key, keep_index=True)
        # Attempt to turn into series.
        try:
            # Behaviour difference from pandas, which will merrily
            # turn any heterogeneous set of columns into a series if
            # you only ask for one row.
            new_name = result.index[0]
            result = Series._concat(
                [result[name] for name in frame._data.names],
                index=result.keys(),
            )
            result.name = new_name
            return result
        except TypeError:
            # Couldn't find a common type, just return a 1xN dataframe.
            return result
    elif isinstance(spec, indexing_utils.EmptyIndexer):
        return frame._empty_like(keep_index=True)
    # Static exhaustiveness check: all IndexingSpec variants must be
    # handled above.
    assert_never(spec)
Comment on lines +1035 to +1094
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here's the handling of structured data that moved from iloc.


@_cudf_nvtx_annotate
def __getitem__(self, arg):
"""
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2129,6 +2129,11 @@ def searchsorted(
value, side=side, ascending=ascending, na_position=na_position
)

def get_slice_bound(self, label, side: str, kind=None) -> int:
    """Return the positional slice bound of ``label`` in this index.

    String labels are first converted to datetimes via
    ``pd.to_datetime`` so that e.g. ``"2001-01-01"`` can be used to
    slice a DatetimeIndex; the parent implementation then computes
    the bound for the converted label.
    """
    if isinstance(label, str):
        label = pd.to_datetime(label)
    return super().get_slice_bound(label, side, kind=kind)
Comment on lines +2132 to +2135
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have to convert strings.


@property # type: ignore
@_cudf_nvtx_annotate
def year(self):
Expand Down
25 changes: 1 addition & 24 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,29 +193,6 @@ def _indices_from_labels(obj, labels):
return lhs.join(rhs).sort_values(by=["__", "_"])["_"]


def _get_label_range_or_mask(index, start, stop, step):
    # Resolve a label-based slice over ``index`` to either a positional
    # slice (via ``index.find_label_range``) or a boolean mask. A
    # non-monotonic DatetimeIndex cannot be sliced positionally, so that
    # case is handled with comparison-based masking instead, returning
    # an empty slice when start > stop.
    if (
        not (start is None and stop is None)
        and type(index) is cudf.core.index.DatetimeIndex
        and index.is_monotonic_increasing is False
    ):
        # Bounds may be strings; normalise to timestamps for comparison.
        start = pd.to_datetime(start)
        stop = pd.to_datetime(stop)
        if start is not None and stop is not None:
            if start > stop:
                return slice(0, 0, None)
            # TODO: Once Index binary ops are updated to support logical_and,
            # can use that instead of using cupy.
            boolean_mask = cp.logical_and((index >= start), (index <= stop))
        elif start is not None:
            boolean_mask = index >= start
        else:
            boolean_mask = index <= stop
        return boolean_mask
    else:
        return index.find_label_range(slice(start, stop, step))
Comment on lines -196 to -216
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This basically moved to indexing_utils.



class _FrameIndexer:
"""Parent class for indexers."""

Expand Down Expand Up @@ -1812,7 +1789,7 @@ def _gather(
self,
gather_map: GatherMap,
keep_index=True,
):
) -> Self:
"""Gather rows of frame specified by indices in `gather_map`.

Maintain the index if keep_index is True.
Expand Down
Loading