Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid redefining Frame._get_columns_by_label in subclasses #15912

Merged
36 changes: 10 additions & 26 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,7 +1345,16 @@ def __getitem__(self, arg):
8 8 8 8
"""
if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple):
return self._get_columns_by_label(arg, downcast=True)
out = self._get_columns_by_label(arg)
if is_scalar(arg):
nlevels = 1
elif isinstance(arg, tuple):
nlevels = len(arg)
if self._data.multiindex is False or nlevels == self._data.nlevels:
out = self._constructor_sliced._from_data(out._data)
out.index = self.index
out.name = arg
return out

elif isinstance(arg, slice):
return self._slice(arg)
Expand Down Expand Up @@ -1990,31 +1999,6 @@ def _repr_html_(self):
def _repr_latex_(self):
return self._get_renderable_dataframe().to_pandas()._repr_latex_()

@_cudf_nvtx_annotate
def _get_columns_by_label(
self, labels, *, downcast=False
) -> Self | Series:
"""
Return columns of dataframe by `labels`

If downcast is True, try and downcast from a DataFrame to a Series
"""
ca = self._data.select_by_label(labels)
if downcast:
if is_scalar(labels):
nlevels = 1
elif isinstance(labels, tuple):
nlevels = len(labels)
if self._data.multiindex is False or nlevels == self._data.nlevels:
out = self._constructor_sliced._from_data(
ca, index=self.index, name=labels
)
return out
out = self.__class__._from_data(
ca, index=self.index, columns=ca.to_pandas_index()
)
return out

def _make_operands_and_index_for_binop(
self,
other: Any,
Expand Down
28 changes: 16 additions & 12 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,19 @@ def deserialize(cls, header, frames):
@classmethod
@_cudf_nvtx_annotate
def _from_data(cls, data: MutableMapping) -> Self:
"""
Construct cls from a ColumnAccessor-like mapping.
"""
obj = cls.__new__(cls)
Frame.__init__(obj, data)
return obj

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping) -> Self:
"""
Return type(self) from a ColumnAccessor-like mapping but
with the external properties, e.g. .index, .name, of self.
"""
return self._from_data(data)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -351,12 +358,13 @@ def equals(self, other) -> bool:
)

@_cudf_nvtx_annotate
def _get_columns_by_label(self, labels, *, downcast=False) -> Self:
def _get_columns_by_label(self, labels) -> Self:
"""
Returns columns of the Frame specified by `labels`
Returns columns of the Frame specified by `labels`.

Akin to cudf.DataFrame(...).loc[:, labels]
"""
return self.__class__._from_data(self._data.select_by_label(labels))
return self._from_data_like_self(self._data.select_by_label(labels))

@property
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1434,22 +1442,18 @@ def _get_sorted_inds(
Get the indices required to sort self according to the columns
specified in by.
"""

to_sort = [
*(
self
if by is None
else self._get_columns_by_label(list(by), downcast=False)
)._columns
]
if by is None:
to_sort = self._columns
else:
to_sort = self._get_columns_by_label(list(by))._columns
Comment on lines +1445 to +1448
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outside of dropping of downcast, is there any reason for the restructure here? Are there cases where we would expect the unpacking here not to work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was mostly a micro-optimization to avoid the unnecessary unpacking of the if by is None case


if is_scalar(ascending):
ascending_lst = [ascending] * len(to_sort)
else:
ascending_lst = list(ascending)

return libcudf.sort.order_by(
to_sort,
list(to_sort),
ascending_lst,
na_position,
stable=True,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,8 @@ def _from_data(

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
out = self._from_data(data, self.index)
out._data._level_names = self._data._level_names
out = super()._from_data_like_self(data)
out.index = self.index
return out

@_cudf_nvtx_annotate
Expand Down
20 changes: 6 additions & 14 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,12 @@ def _from_data(
out.name = name
return out

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
out = super()._from_data_like_self(data)
out.name = self.name
return out

@_cudf_nvtx_annotate
def __contains__(self, item):
return item in self.index
Expand Down Expand Up @@ -856,20 +862,6 @@ def deserialize(cls, header, frames):

return obj

def _get_columns_by_label(self, labels, *, downcast=False) -> Self:
"""Return the column specified by `labels`

For cudf.Series, either the column, or an empty series is returned.
Parameter `downcast` does not have effects.
"""
ca = self._data.select_by_label(labels)

return (
self.__class__._from_data(data=ca, index=self.index)
if len(ca) > 0
else self.__class__(dtype=self.dtype, name=self.name)
)

@_cudf_nvtx_annotate
def drop(
self,
Expand Down
Loading