Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Implement iloc-getitem using parse-don't-validate approach
Browse files Browse the repository at this point in the history
To simplify the low-level implementation of iloc-based getitem on both
Series and DataFrames, change the dispatching approach to parse the
user-provided "unstructured" key into structured data (a tagged
union using an enum + tuple). At the libcudf level, there are four
styles of indexing we can do:

1. index by slice
2. index by mask
3. index by map
4. index by scalar

iloc keys are parsed into information that tags them by type and
normalises the key to an appropriate column or other low-level object.

This centralises the business logic for index parsing in a
single place, and ensures that downstream consumers of the validated
and normalised indexer don't need to inspect it again to determine
what to do. Note that we treat index by scalar as a composition of index
by map with get_element (since that simplifies the logic when
extracting a single row of a dataframe: we want to keep it on
device), but the scalar "type tag" allows us to determine this
unambiguously without reinspecting the key.

The major benefits will come when updating loc-based getitem (where
the parsing rules are more complicated, but eventually turn into one
of the above four cases). In that case, we will no longer
attempt to turn a loc-based key into a "user-facing" key for iloc, but
rather will call directly into the pre-parsed interface.

That said, this change already provides some performance improvement,
since the user-provided key is now inspected only once.

- Closes rapidsai#13013
- Closes rapidsai#13267
- Closes rapidsai#13515
  • Loading branch information
wence- committed Jun 8, 2023
1 parent 2d74cc7 commit 41598e3
Show file tree
Hide file tree
Showing 6 changed files with 476 additions and 228 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def is_integer(obj):
bool
"""
if isinstance(obj, cudf.Scalar):
return pd.api.types.is_integer(obj.dtype)
return pd.api.types.is_integer_dtype(obj.dtype)
return pd.api.types.is_integer(obj)


Expand Down
239 changes: 76 additions & 163 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Set,
Tuple,
Union,
cast,
)

import cupy
Expand All @@ -36,7 +37,7 @@
from pandas.core.dtypes.common import is_float, is_integer
from pandas.io.formats import console
from pandas.io.formats.printing import pprint_thing
from typing_extensions import Self
from typing_extensions import assert_never

import cudf
import cudf.core.common
Expand All @@ -57,7 +58,7 @@
is_string_dtype,
is_struct_dtype,
)
from cudf.core import column, df_protocol, reshape
from cudf.core import column, df_protocol, indexing_utils, reshape
from cudf.core.abc import Serializable
from cudf.core.column import (
CategoricalColumn,
Expand All @@ -70,13 +71,7 @@
)
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
from cudf.core.index import (
BaseIndex,
Index,
RangeIndex,
_index_from_data,
as_index,
)
from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -401,57 +396,80 @@ class _DataFrameIlocIndexer(_DataFrameIndexer):
For selection by index.
"""

@_cudf_nvtx_annotate
def _getitem_tuple_arg(self, arg):
# Iloc Step 1:
# Gather the columns specified by the second tuple arg
columns_df = self._frame._from_data(
self._frame._data.select_by_index(arg[1]), self._frame._index
)
_frame: DataFrame

# Iloc Step 2:
# Gather the rows specified by the first tuple arg
if isinstance(columns_df.index, MultiIndex):
if isinstance(arg[0], slice):
df = columns_df[arg[0]]
else:
df = columns_df.index._get_row_major(columns_df, arg[0])
if (len(df) == 1 and len(columns_df) >= 1) and not (
isinstance(arg[0], slice) or isinstance(arg[1], slice)
):
# Pandas returns a numpy scalar in this case
return df.iloc[0]
if self._can_downcast_to_series(df, arg):
return self._downcast_to_series(df, arg)
return df
def __getitem__(self, arg):
row_spec, (
col_scalar,
column_names,
) = indexing_utils.unpack_dataframe_iloc_indexer(arg, self._frame)
row_tag, row_key = indexing_utils.normalize_row_iloc_indexer(
row_spec, len(self._frame), check_bounds=True
)
ca = self._frame._data
index = self._frame.index
if col_scalar:
# TODO column accessor should offer this interface
# Don't want to go through select_by_label because it does
# too much work and we've already turned this into
# appropriate indices.
(name,) = column_names
s = Series._from_data(
ca.__class__(
{name: ca[name]},
multiindex=ca.multiindex,
level_names=ca.level_names,
),
index=index,
)
return s._get(row_tag, row_key)
if column_names != list(self._frame._column_names):
frame = self._frame._from_data(
ca.__class__(
{k: ca[k] for k in column_names},
multiindex=ca.multiindex,
level_names=ca.level_names,
),
index=index,
)
else:
if isinstance(arg[0], slice):
df = columns_df._slice(arg[0])
elif is_scalar(arg[0]):
index = arg[0]
if index < 0:
index += len(columns_df)
df = columns_df._slice(slice(index, index + 1, 1))
else:
arg = (as_column(arg[0]), arg[1])
if is_bool_dtype(arg[0]):
df = columns_df._apply_boolean_mask(arg[0])
else:
df = columns_df._gather(arg[0])

# Iloc Step 3:
# Reindex
if df.shape[0] == 1: # we have a single row without an index
df.index = as_index(self._frame.index[arg[0]])

# Iloc Step 4:
# Downcast
if self._can_downcast_to_series(df, arg):
return self._downcast_to_series(df, arg)

if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
df._index = as_index(self._frame.index[arg[0]])
return df
frame = self._frame
if row_tag is indexing_utils.IndexTag.MAP:
return frame._gather(
row_key,
keep_index=True,
nullify=False,
normalize_and_check=False,
)
elif row_tag is indexing_utils.IndexTag.MASK:
return frame._apply_boolean_mask(
row_key, keep_index=True, normalize_and_check=False
)
elif row_tag is indexing_utils.IndexTag.SLICE:
return frame._slice(cast(slice, row_key))
elif row_tag is indexing_utils.IndexTag.SCALAR:
result = frame._gather(
row_key,
keep_index=True,
nullify=False,
normalize_and_check=False,
)
# Attempt to turn into series.
try:
# Behaviour difference from pandas, which will merrily
# turn any heterogeneous set of columns into a series if
# you only ask for one row.
new_name = result.index[0]
result = Series._concat(
[result[name] for name in column_names],
index=result.keys(),
)
result.name = new_name
return result
except TypeError:
# Couldn't find a common type, just return a 1xN dataframe.
return result
assert_never(row_tag)

@_cudf_nvtx_annotate
def _setitem_tuple_arg(self, key, value):
Expand Down Expand Up @@ -499,10 +517,6 @@ def _setitem_tuple_arg(self, key, value):
for i, col in enumerate(columns_df._column_names):
self._frame[col].iloc[key[0]] = value[i]

def _getitem_scalar(self, arg):
col = self._frame.columns[arg[1]]
return self._frame[col].iloc[arg[0]]


class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
"""
Expand Down Expand Up @@ -1303,107 +1317,6 @@ def __setitem__(self, arg, value):
def __delitem__(self, name):
self._drop_column(name)

@_cudf_nvtx_annotate
def _slice(self, arg: slice) -> Self:
"""
_slice : slice the frame as per the arg
Parameters
----------
arg : should always be of type slice
"""
num_rows = len(self)
if num_rows == 0:
return self
start, stop, stride = arg.indices(num_rows)

# early stop for empty cases
if len(range(start, stop, stride)) == 0:
columns = ColumnAccessor(
{
colname: column.column_empty_like(col, newsize=0)
for colname, col in self._data.items()
},
multiindex=self._data.multiindex,
level_names=self._data.level_names,
)

if isinstance(self.index, MultiIndex):
mi_columns = ColumnAccessor(
{
colname: column.column_empty_like(col, newsize=0)
for colname, col in self.index._data.items()
}
)
return DataFrame._from_data(
columns,
index=MultiIndex._from_data(
mi_columns, name=self.index.name
),
)
else:
return DataFrame._from_data(
columns,
index=(
RangeIndex(
start=start,
stop=stop,
step=stride,
name=self.index.name,
)
if isinstance(self.index, RangeIndex)
else Index(
[], dtype=self.index.dtype, name=self.index.name
)
),
)

# If index type is RangeIndex, slice without materializing.
is_range_index = isinstance(self.index, RangeIndex)
if is_range_index:
if self._num_columns == 0:
result = self._empty_like(keep_index=False)
result._index = self.index[start:stop:stride]
return result

if start < 0:
start = start + num_rows

# Decreasing slices that terminates at -1, such as slice(4, -1, -1),
# has end index of 0, The check below makes sure -1 is not wrapped
# to `-1 + num_rows`.
if stop < 0 and not (stride < 0 and stop == -1):
stop = stop + num_rows
stride = 1 if stride is None else stride

if (stop - start) * stride <= 0:
return self._empty_like(keep_index=True)

start = len(self) if start > num_rows else start
stop = len(self) if stop > num_rows else stop

if stride != 1:
return self._gather(
cudf.core.column.arange(
start, stop=stop, step=stride, dtype=np.int32
)
)

columns_to_slice = [
*(self._index._data.columns if not is_range_index else []),
*self._columns,
]
result = self._from_columns_like_self(
libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0],
self._column_names,
None if is_range_index else self._index.names,
)

if is_range_index:
result.index = self.index[start:stop]
return result

@_cudf_nvtx_annotate
def memory_usage(self, index=True, deep=False):
mem_usage = [col.memory_usage for col in self._data.columns]
Expand Down
Loading

0 comments on commit 41598e3

Please sign in to comment.