Skip to content

Commit

Permalink
Eagerly populate the class dict for cudf.pandas proxy types (#14534)
Browse files Browse the repository at this point in the history
Rather than dynamically looking up class attributes (and methods), this PR makes it so that we eagerly populate the class with all known methods and attributes (by inspecting the "slow" class).

This solves a number of problems:

- it makes `getattr` trivially inexpensive (no dynamic `__getattr__` for each attribute access)
- it ensures the _same_ object is returned every time you do, e.g., `DataFrame.max`
- it makes tab completion fast because the attributes don't have to be computed each time
- it no longer exposes attributes that are specific to cuDF - for example `Series.list`
- it allows subclassing of proxy types to work better. For example, derived types can now use `super()` to access attributes of base types

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #14534
  • Loading branch information
shwina authored May 17, 2024
1 parent d10b8e4 commit e6e6761
Show file tree
Hide file tree
Showing 9 changed files with 326 additions and 157 deletions.
12 changes: 0 additions & 12 deletions docs/cudf/source/cudf_pandas/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,3 @@ for testing or benchmarking purposes. To do so, set the
```bash
CUDF_PANDAS_FALLBACK_MODE=1 python -m cudf.pandas some_script.py
```

## Slow tab completion in IPython?

You may experience slow tab completion when inspecting the
methods/attributes of large dataframes. We expect this issue to be
resolved in an upcoming release. In the meantime, you may execute the
following command in IPython before loading `cudf.pandas` to work
around the issue:

```
%config IPCompleter.jedi_compute_type_timeout=0
```
8 changes: 4 additions & 4 deletions python/cudf/cudf/pandas/_wrappers/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

Expand All @@ -17,9 +17,9 @@ def array_method(self: _FastSlowProxy, *args, **kwargs):

def array_function_method(self, func, types, args, kwargs):
try:
return _FastSlowAttribute("__array_function__").__get__(self)(
func, types, args, kwargs
)
return _FastSlowAttribute("__array_function__").__get__(
self, type(self)
)(func, types, args, kwargs)
except Exception:
# if something went wrong with __array_function__ we
# attempt to call the function directly on the slow
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/pandas/_wrappers/numpy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

Expand All @@ -10,6 +10,7 @@
import numpy.core.multiarray

from ..fast_slow_proxy import (
_FastSlowAttribute,
make_final_proxy_type,
make_intermediate_proxy_type,
)
Expand Down Expand Up @@ -122,6 +123,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
"__iter__": custom_iter,
# Special wrapping to handle scalar values
"_fsproxy_wrap": classmethod(wrap_ndarray),
"base": _FastSlowAttribute("base", private=True),
},
)

Expand Down
117 changes: 100 additions & 17 deletions python/cudf/cudf/pandas/_wrappers/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,16 @@ class _AccessorAttr:
"""

def __init__(self, typ):
self.__typ = typ
self._typ = typ

def __set_name__(self, owner, name):
self._name = name

def __get__(self, obj, cls=None):
if obj is None:
return self.__typ
return self._typ
else:
# allow __getattr__ to handle this
raise AttributeError()
return _FastSlowAttribute(self._name).__get__(obj, type(obj))


def Timestamp_Timedelta__new__(cls, *args, **kwargs):
Expand Down Expand Up @@ -214,6 +216,7 @@ def _DataFrame__dir__(self):
"__dir__": _DataFrame__dir__,
"_constructor": _FastSlowAttribute("_constructor"),
"_constructor_sliced": _FastSlowAttribute("_constructor_sliced"),
"_accessors": set(),
},
)

Expand All @@ -236,6 +239,7 @@ def _DataFrame__dir__(self):
"cat": _AccessorAttr(_CategoricalAccessor),
"_constructor": _FastSlowAttribute("_constructor"),
"_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"),
"_accessors": set(),
},
)

Expand Down Expand Up @@ -273,6 +277,9 @@ def Index__new__(cls, *args, **kwargs):
"__new__": Index__new__,
"_constructor": _FastSlowAttribute("_constructor"),
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
"_accessors": set(),
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

Expand Down Expand Up @@ -337,7 +344,11 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=lambda fast: fast.to_pandas(),
slow_to_fast=cudf.from_pandas,
bases=(Index,),
additional_attributes={"__init__": _DELETE},
additional_attributes={
"__init__": _DELETE,
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

DatetimeArray = make_final_proxy_type(
Expand All @@ -346,6 +357,10 @@ def Index__new__(cls, *args, **kwargs):
pd.arrays.DatetimeArray,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

DatetimeTZDtype = make_final_proxy_type(
Expand All @@ -364,7 +379,11 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=lambda fast: fast.to_pandas(),
slow_to_fast=cudf.from_pandas,
bases=(Index,),
additional_attributes={"__init__": _DELETE},
additional_attributes={
"__init__": _DELETE,
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

NumpyExtensionArray = make_final_proxy_type(
Expand All @@ -385,6 +404,10 @@ def Index__new__(cls, *args, **kwargs):
pd.arrays.TimedeltaArray,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

PeriodIndex = make_final_proxy_type(
Expand All @@ -394,7 +417,11 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
bases=(Index,),
additional_attributes={"__init__": _DELETE},
additional_attributes={
"__init__": _DELETE,
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

PeriodArray = make_final_proxy_type(
Expand All @@ -403,6 +430,11 @@ def Index__new__(cls, *args, **kwargs):
pd.arrays.PeriodArray,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
},
)

PeriodDtype = make_final_proxy_type(
Expand Down Expand Up @@ -464,6 +496,10 @@ def Index__new__(cls, *args, **kwargs):
pd.arrays.StringArray,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

StringDtype = make_final_proxy_type(
Expand All @@ -472,7 +508,10 @@ def Index__new__(cls, *args, **kwargs):
pd.StringDtype,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
additional_attributes={
"__hash__": _FastSlowAttribute("__hash__"),
"storage": _FastSlowAttribute("storage"),
},
)

BooleanArray = make_final_proxy_type(
Expand All @@ -482,7 +521,9 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
},
)

Expand All @@ -502,7 +543,9 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

Expand Down Expand Up @@ -586,7 +629,11 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=lambda fast: fast.to_pandas(),
slow_to_fast=cudf.from_pandas,
bases=(Index,),
additional_attributes={"__init__": _DELETE},
additional_attributes={
"__init__": _DELETE,
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

IntervalArray = make_final_proxy_type(
Expand All @@ -595,6 +642,10 @@ def Index__new__(cls, *args, **kwargs):
pd.arrays.IntervalArray,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

IntervalDtype = make_final_proxy_type(
Expand Down Expand Up @@ -622,7 +673,9 @@ def Index__new__(cls, *args, **kwargs):
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
"__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
"_data": _FastSlowAttribute("_data", private=True),
"_mask": _FastSlowAttribute("_mask", private=True),
},
)

Expand Down Expand Up @@ -798,6 +851,14 @@ def Index__new__(cls, *args, **kwargs):
pd_Styler,
fast_to_slow=_Unusable(),
slow_to_fast=_Unusable(),
additional_attributes={
    # Each key is exposed on the proxy type and forwards to the attribute
    # of the same name on the wrapped object.
    "css": _FastSlowAttribute("css"),
    "ctx": _FastSlowAttribute("ctx"),
    # "index" must forward to "index" (not "ctx"), like every other entry.
    "index": _FastSlowAttribute("index"),
    "data": _FastSlowAttribute("data"),
    "_display_funcs": _FastSlowAttribute("_display_funcs"),
    "table_styles": _FastSlowAttribute("table_styles"),
},
)
except ImportError:
# Styler requires Jinja to be installed
Expand All @@ -813,7 +874,7 @@ def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None):
return local_dict, global_dict


@register_proxy_func(pd.eval)
@register_proxy_func(pd.core.computation.eval.eval)
@nvtx.annotate(
"CUDF_PANDAS_EVAL",
color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
Expand Down Expand Up @@ -843,6 +904,24 @@ def _eval(
)


# Capture DataFrame.eval as it is defined at this point so that
# _df_eval_method below can delegate to it.
_orig_df_eval_method = DataFrame.eval


@register_proxy_func(pd.core.accessor.register_dataframe_accessor)
def _register_dataframe_accessor(name):
    """Stand-in for ``pd.core.accessor.register_dataframe_accessor``.

    Forwards ``name`` to ``pd.core.accessor._register_accessor`` with this
    module's ``DataFrame`` as the target class and returns whatever that
    call returns.
    """
    register = pd.core.accessor._register_accessor
    return register(name, DataFrame)


@register_proxy_func(pd.core.accessor.register_series_accessor)
def _register_series_accessor(name):
    """Stand-in for ``pd.core.accessor.register_series_accessor``.

    Forwards ``name`` to ``pd.core.accessor._register_accessor`` with this
    module's ``Series`` as the target class and returns whatever that call
    returns.
    """
    register = pd.core.accessor._register_accessor
    return register(name, Series)


@register_proxy_func(pd.core.accessor.register_index_accessor)
def _register_index_accessor(name):
    """Stand-in for ``pd.core.accessor.register_index_accessor``.

    Forwards ``name`` to ``pd.core.accessor._register_accessor`` with this
    module's ``Index`` as the target class and returns whatever that call
    returns.
    """
    register = pd.core.accessor._register_accessor
    return register(name, Index)


@nvtx.annotate(
"CUDF_PANDAS_DATAFRAME_EVAL",
color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
Expand All @@ -853,11 +932,14 @@ def _df_eval_method(self, *args, local_dict=None, global_dict=None, **kwargs):
local_dict, global_dict = _get_eval_locals_and_globals(
level, local_dict, global_dict
)
return super(type(self), self).__getattr__("eval")(
*args, local_dict=local_dict, global_dict=global_dict, **kwargs
return _orig_df_eval_method(
self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs
)


# Capture DataFrame.query as it is defined at this point so that
# _df_query_method below can delegate to it.
# NOTE: despite "eval" in the name, this holds ``DataFrame.query``.
_orig_query_eval_method = DataFrame.query


@nvtx.annotate(
"CUDF_PANDAS_DATAFRAME_QUERY",
color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
Expand All @@ -870,8 +952,8 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs):
local_dict, global_dict = _get_eval_locals_and_globals(
level, local_dict, global_dict
)
return super(type(self), self).__getattr__("query")(
*args, local_dict=local_dict, global_dict=global_dict, **kwargs
return _orig_query_eval_method(
self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs
)


Expand Down Expand Up @@ -1277,6 +1359,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs):
additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
)


MonthBegin = make_final_proxy_type(
"MonthBegin",
_Unusable,
Expand Down
Loading

0 comments on commit e6e6761

Please sign in to comment.