Parse (non-MultiIndex) label-based keys to structured data #13717
base: branch-23.10
@@ -20,7 +20,6 @@
import pandas as pd
from packaging.version import Version
from pandas.api.types import is_bool
from typing_extensions import Self

import cudf
from cudf.core import column
@@ -404,6 +403,10 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
        ColumnAccessor
        """
        keys = self.get_labels_by_index(index)
        if len(set(keys)) != len(keys):
            raise NotImplementedError(
                "cudf DataFrames do not support repeated column names"
            )
        data = {k: self._data[k] for k in keys}
        return self.__class__(
            data,
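To illustrate the new guard in select_by_index, a minimal sketch (assuming positional column keys from iloc are resolved through this method, as in the surrounding code path; data values are illustrative):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})

# Repeating a positional column key would yield duplicate labels, which a
# cudf ColumnAccessor cannot represent, so this now raises loudly.
try:
    df.iloc[:, [0, 0]]
except NotImplementedError as err:
    print(err)  # cudf DataFrames do not support repeated column names
```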
@@ -477,13 +480,6 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
        self._data[key] = value
        self._clear_cache()

    def _select_by_names(self, names: abc.Sequence) -> Self:
        return self.__class__(
            {key: self[key] for key in names},
            multiindex=self.multiindex,
            level_names=self.level_names,
        )
Comment on lines -480 to -486: I introduced this in #13534, but I realised it's better if the column key parsing returns a ColumnAccessor.
    def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
        # Might be a generator
        key = tuple(key)

@@ -499,6 +495,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
                if keep
            )
        else:
            if len(set(key)) != len(key):
                raise NotImplementedError(
                    "cudf DataFrames do not support repeated column names"
                )
            data = {k: self._grouped_data[k] for k in key}
            if self.multiindex:
                data = _to_flat_dict(data)
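Similarly, for label-based list selection, a small usage sketch of the behaviour added above (column names and values are illustrative):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})

# A repeated label in a list-like key hits the new check in
# _select_by_label_list_like and raises instead of silently deduplicating
# through the dict construction.
try:
    df[["a", "a"]]
except NotImplementedError as err:
    print(err)  # cudf DataFrames do not support repeated column names
```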
@@ -36,7 +36,7 @@
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing
from typing_extensions import assert_never
from typing_extensions import Self, assert_never

import cudf
import cudf.core.common
@@ -75,7 +75,6 @@
from cudf.core.indexed_frame import (
    IndexedFrame,
    _FrameIndexer,
    _get_label_range_or_mask,
    _indices_from_labels,
    doc_reset_index_template,
)
@@ -122,21 +121,7 @@ def _shape_mismatch_error(x, y):


class _DataFrameIndexer(_FrameIndexer):
    def __getitem__(self, arg):
        if (
            isinstance(self._frame.index, MultiIndex)
            or self._frame._data.multiindex
        ):
            # This try/except block allows the use of pandas-like
            # tuple arguments into MultiIndex dataframes.
            try:
                return self._getitem_tuple_arg(arg)
            except (TypeError, KeyError, IndexError, ValueError):
                return self._getitem_tuple_arg((arg, slice(None)))
        else:
            if not isinstance(arg, tuple):
                arg = (arg, slice(None))
            return self._getitem_tuple_arg(arg)
Comment on lines -125 to -139: No longer shared between the loc and iloc indexers.
    _frame: DataFrame

    def __setitem__(self, key, value):
        if not isinstance(key, tuple):
@@ -229,14 +214,30 @@ class _DataFrameLocIndexer(_DataFrameIndexer):
    For selection by label.
    """

    @_cudf_nvtx_annotate
    def _getitem_scalar(self, arg):
        return self._frame[arg[1]].loc[arg[0]]
    def __getitem__(self, arg):
        if isinstance(self._frame.index, cudf.MultiIndex):
            # This try/except block allows the use of pandas-like
            # tuple arguments into MultiIndex dataframes.
            try:
                return self._getitem_tuple_arg(arg)
            except (TypeError, KeyError, IndexError, ValueError):
                return self._getitem_tuple_arg((arg, slice(None)))
        else:
            row_key, (
                col_is_scalar,
                ca,
            ) = indexing_utils.destructure_dataframe_loc_indexer(
                arg, self._frame
            )
            row_spec = indexing_utils.parse_row_loc_indexer(
                row_key, self._frame.index
            )
            return self._frame._getitem_preprocessed(
                row_spec, col_is_scalar, ca
            )
Comment on lines +226 to +237: This is the new approach (which doesn't handle multiindex lookups yet).
    @_cudf_nvtx_annotate
    def _getitem_tuple_arg(self, arg):
        from uuid import uuid4

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._frame._get_columns_by_label(arg[1])
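As a usage-level sketch of what the new loc __getitem__ above dispatches in the non-MultiIndex case (column names, index labels and values are illustrative):

```python
import cudf

df = cudf.DataFrame(
    {"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]}, index=["a", "b", "c"]
)

# Each key is destructured into a row key plus a pre-parsed column
# ColumnAccessor, then handled by DataFrame._getitem_preprocessed.
df.loc["b"]                      # scalar row label
df.loc["a":"b", "x"]             # label slice of rows, scalar column
df.loc[["a", "c"], ["x", "y"]]   # lists of row and column labels
```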
@@ -262,64 +263,7 @@ def _getitem_tuple_arg(self, arg):
                row_arg = arg
            return columns_df.index._get_row_major(columns_df, row_arg)
        else:
            if isinstance(arg[0], slice):
                out = _get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(
                        BooleanMask.from_column_unchecked(
                            cudf.core.column.as_column(out)
                        )
                    )
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is possibility of having duplicates.
                    # Join would get all the duplicates. So, converting it to
                    # an array kind.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (as_column(tmp_arg[0]), tmp_arg[1])

                if is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(
                        BooleanMask(tmp_arg[0], len(columns_df))
                    )
                else:
                    tmp_col_name = str(uuid4())
                    cantor_name = "_" + "_".join(
                        map(str, columns_df._data.names)
                    )
                    if columns_df._data.multiindex:
                        # column names must be appropriate length tuples
                        extra = tuple(
                            "" for _ in range(columns_df._data.nlevels - 1)
                        )
                        tmp_col_name = (tmp_col_name, *extra)
                        cantor_name = (cantor_name, *extra)
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    columns_df[cantor_name] = column.arange(len(columns_df))
                    df = other_df.join(columns_df, how="inner")
                    # as join is not assigning any names to index,
                    # update it over here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(by=[tmp_col_name, cantor_name])
                    df.drop(columns=[tmp_col_name, cantor_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

            # Step 3: Downcast
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
            raise RuntimeError("Should have been handled by now")

    @_cudf_nvtx_annotate
    def _setitem_tuple_arg(self, key, value):
@@ -336,10 +280,16 @@ def _setitem_tuple_arg(self, key, value):
            columns_df = self._frame._get_columns_by_label(key[1])
        except KeyError:
            if not self._frame.empty and isinstance(key[0], slice):
                pos_range = _get_label_range_or_mask(
                    self._frame.index, key[0].start, key[0].stop, key[0].step
                indexer = indexing_utils.find_label_range_or_mask(
                    key[0], self._frame.index
                )
                idx = self._frame.index[pos_range]
                index = self._frame.index
                if isinstance(indexer, indexing_utils.EmptyIndexer):
                    idx = index[0:0:1]
                elif isinstance(indexer, indexing_utils.SliceIndexer):
                    idx = index[indexer.key]
                else:
                    idx = index[indexer.key.column]
Comment on lines -339 to +292: I've moved _get_label_range_or_mask into indexing_utils as find_label_range_or_mask.
            elif self._frame.empty and isinstance(key[0], slice):
                idx = None
            else:
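The branch above covers assigning to a not-yet-existing column with a label-sliced row key; a small sketch of the case it serves (column name, labels and values are illustrative):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])

# "b" does not exist yet, so _get_columns_by_label raises KeyError and the
# slice 10:20 is resolved through indexing_utils.find_label_range_or_mask
# to pick the index subset that receives the new values.
df.loc[10:20, "b"] = 7.0
print(df)
```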
@@ -416,55 +366,15 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
    For selection by index.
    """

    _frame: DataFrame

Comment on lines -419 to -420: This moved to the parent class.

    def __getitem__(self, arg):
        row_key, (
            col_is_scalar,
            column_names,
            ca,
        ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame)
        row_spec = indexing_utils.parse_row_iloc_indexer(
            row_key, len(self._frame)
        )
        ca = self._frame._data
        index = self._frame.index
        if col_is_scalar:
            s = Series._from_data(
                ca._select_by_names(column_names), index=index
            )
            return s._getitem_preprocessed(row_spec)
        if column_names != list(self._frame._column_names):
            frame = self._frame._from_data(
                ca._select_by_names(column_names), index=index
            )
        else:
            frame = self._frame
        if isinstance(row_spec, indexing_utils.MapIndexer):
            return frame._gather(row_spec.key, keep_index=True)
        elif isinstance(row_spec, indexing_utils.MaskIndexer):
            return frame._apply_boolean_mask(row_spec.key, keep_index=True)
        elif isinstance(row_spec, indexing_utils.SliceIndexer):
            return frame._slice(row_spec.key)
        elif isinstance(row_spec, indexing_utils.ScalarIndexer):
            result = frame._gather(row_spec.key, keep_index=True)
            # Attempt to turn into series.
            try:
                # Behaviour difference from pandas, which will merrily
                # turn any heterogeneous set of columns into a series if
                # you only ask for one row.
                new_name = result.index[0]
                result = Series._concat(
                    [result[name] for name in column_names],
                    index=result.keys(),
                )
                result.name = new_name
                return result
            except TypeError:
                # Couldn't find a common type, just return a 1xN dataframe.
                return result
        elif isinstance(row_spec, indexing_utils.EmptyIndexer):
            return frame._empty_like(keep_index=True)
        assert_never(row_spec)
        return self._frame._getitem_preprocessed(row_spec, col_is_scalar, ca)

Comment on lines -429 to +377: Promotes the handling of structured data to the dataframe object (so this is code movement).

    @_cudf_nvtx_annotate
    def _setitem_tuple_arg(self, key, value):
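For orientation, a brief sketch of how typical iloc row keys presumably map onto the indexer types referenced above (the mapping is inferred from the handling in the diff; values are illustrative):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

df.iloc[1]                    # ScalarIndexer -> Series (columns share a type)
df.iloc[0:2]                  # SliceIndexer  -> sliced DataFrame
df.iloc[[2, 0]]               # MapIndexer    -> gathered DataFrame
df.iloc[[True, False, True]]  # MaskIndexer   -> boolean-filtered DataFrame
```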
@@ -963,7 +873,7 @@ def _from_data(
        data: MutableMapping,
        index: Optional[BaseIndex] = None,
        columns: Any = None,
    ) -> DataFrame:
    ) -> Self:
        out = super()._from_data(data=data, index=index)
        if columns is not None:
            out.columns = columns
@@ -1122,6 +1032,67 @@ def __setattr__(self, key, col):
        else:
            super().__setattr__(key, col)

    def _getitem_preprocessed(
        self,
        spec: indexing_utils.IndexingSpec,
        col_is_scalar: bool,
        ca: ColumnAccessor,
    ) -> Union[Self, Series]:
        """Get a subset of rows and columns given structured data

        Parameters
        ----------
        spec
            Indexing specification for the rows
        col_is_scalar
            Was the indexer of the columns a scalar (return a Series)
        ca
            ColumnAccessor representing the subsetted column data

        Returns
        -------
        Subsetted DataFrame or Series (if a scalar row is requested
        and the concatenation of the column types is possible)

        Notes
        -----
        This function performs no bounds-checking or massaging of the
        inputs.
        """
        if col_is_scalar:
            series = Series._from_data(ca, index=self.index)
            return series._getitem_preprocessed(spec)
        if ca.names != self._data.names:
            frame = self._from_data(ca, index=self.index)
        else:
            frame = self
        if isinstance(spec, indexing_utils.MapIndexer):
            return frame._gather(spec.key, keep_index=True)
        elif isinstance(spec, indexing_utils.MaskIndexer):
            return frame._apply_boolean_mask(spec.key, keep_index=True)
        elif isinstance(spec, indexing_utils.SliceIndexer):
            return frame._slice(spec.key)
        elif isinstance(spec, indexing_utils.ScalarIndexer):
            result = frame._gather(spec.key, keep_index=True)
            # Attempt to turn into series.
            try:
                # Behaviour difference from pandas, which will merrily
                # turn any heterogeneous set of columns into a series if
                # you only ask for one row.
                new_name = result.index[0]
                result = Series._concat(
                    [result[name] for name in frame._data.names],
                    index=result.keys(),
                )
                result.name = new_name
                return result
            except TypeError:
                # Couldn't find a common type, just return a 1xN dataframe.
                return result
        elif isinstance(spec, indexing_utils.EmptyIndexer):
            return frame._empty_like(keep_index=True)
        assert_never(spec)
Comment on lines +1035 to +1094: And here's the handling of structured data that moved from the iloc indexer.
    @_cudf_nvtx_annotate
    def __getitem__(self, arg):
        """
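The ScalarIndexer branch of _getitem_preprocessed only downcasts to a Series when the selected columns share a common type; a short sketch of the observable behaviour described in the code comments above (column names and values are illustrative):

```python
import cudf

numeric = cudf.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
print(type(numeric.iloc[0]))  # Series: int and float find a common type

mixed = cudf.DataFrame({"a": [1, 2], "s": ["x", "y"]})
print(type(mixed.iloc[0]))    # 1xN DataFrame: no common type, unlike pandas
```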
@@ -2129,6 +2129,11 @@ def searchsorted(
            value, side=side, ascending=ascending, na_position=na_position
        )

    def get_slice_bound(self, label, side: str, kind=None) -> int:
        if isinstance(label, str):
            label = pd.to_datetime(label)
        return super().get_slice_bound(label, side, kind=kind)
Comment on lines +2132 to +2135: We have to convert strings.
    @property  # type: ignore
    @_cudf_nvtx_annotate
    def year(self):
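A small sketch of what the get_slice_bound change above keeps working, namely string bounds for label slices over a DatetimeIndex (dates and values are illustrative):

```python
import cudf

df = cudf.DataFrame(
    {"v": [1, 2, 3]},
    index=cudf.date_range("2001-01-01", periods=3, freq="D"),
)

# String labels are converted with pd.to_datetime before the slice bounds
# are looked up on the DatetimeIndex.
df.loc["2001-01-01":"2001-01-02"]
```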
@@ -193,29 +193,6 @@ def _indices_from_labels(obj, labels):
    return lhs.join(rhs).sort_values(by=["__", "_"])["_"]


def _get_label_range_or_mask(index, start, stop, step):
    if (
        not (start is None and stop is None)
        and type(index) is cudf.core.index.DatetimeIndex
        and index.is_monotonic_increasing is False
    ):
        start = pd.to_datetime(start)
        stop = pd.to_datetime(stop)
        if start is not None and stop is not None:
            if start > stop:
                return slice(0, 0, None)
            # TODO: Once Index binary ops are updated to support logical_and,
            # can use that instead of using cupy.
            boolean_mask = cp.logical_and((index >= start), (index <= stop))
        elif start is not None:
            boolean_mask = index >= start
        else:
            boolean_mask = index <= stop
        return boolean_mask
    else:
        return index.find_label_range(slice(start, stop, step))

Comment on lines -196 to -216: This basically moved to indexing_utils.


class _FrameIndexer:
    """Parent class for indexers."""
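The removed helper fell back to an inclusive boolean mask for label slices over a non-monotonic DatetimeIndex; a sketch of the case it handled, which the indexing_utils replacement presumably still covers (dates and values are illustrative):

```python
import pandas as pd
import cudf

df = cudf.DataFrame(
    {"v": [1, 2, 3]},
    index=pd.to_datetime(["2001-01-03", "2001-01-01", "2001-01-02"]),
)

# The index is not monotonic, so the label slice cannot become a positional
# slice; rows are instead selected with (index >= start) & (index <= stop).
df.loc["2001-01-01":"2001-01-02"]
```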
@@ -1812,7 +1789,7 @@ def _gather(
        self,
        gather_map: GatherMap,
        keep_index=True,
    ):
    ) -> Self:
        """Gather rows of frame specified by indices in `gather_map`.

        Maintain the index if keep_index is True.
This now loudly raises when getting from a ColumnAccessor would produce duplicate column names.