Skip to content

Commit

Permalink
Remove override_dtypes and include_index from `Frame._copy_type_m…
Browse files Browse the repository at this point in the history
…etadata` (#16043)

* `override_dtypes` logic was only needed for `.explode`. I think it's appropriate to make it a postprocessing step in that function
* `include_index` logic was able to be transferred more simply to `IndexedFrame._from_columns_like_self`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #16043
  • Loading branch information
mroeschke authored Jun 18, 2024
1 parent dcc153b commit 102d30a
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 123 deletions.
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,7 @@ def __contains__(self, item):
hash(item)
return item in self._values

def _copy_type_metadata(
self, other: Self, *, override_dtypes=None
) -> Self:
def _copy_type_metadata(self: Self, other: Self) -> Self:
raise NotImplementedError

def get_level_values(self, level):
Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7361,9 +7361,6 @@ def explode(self, column, ignore_index=False):
3 4 44
3 5 44
"""
if column not in self._column_names:
raise KeyError(column)

return super()._explode(column, ignore_index)

def pct_change(
Expand Down Expand Up @@ -7511,14 +7508,11 @@ def _from_columns_like_self(
columns: list[ColumnBase],
column_names: abc.Iterable[str] | None = None,
index_names: list[str] | None = None,
*,
override_dtypes: abc.Iterable[Dtype | None] | None = None,
) -> DataFrame:
result = super()._from_columns_like_self(
columns,
column_names,
index_names,
override_dtypes=override_dtypes,
)
result._set_columns_like(self._data)
return result
Expand Down
26 changes: 4 additions & 22 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations

import copy
import itertools
import operator
import pickle
import warnings
Expand Down Expand Up @@ -80,7 +79,7 @@ def _columns(self) -> tuple[ColumnBase, ...]:
return self._data.columns

@property
def _dtypes(self) -> abc.Iterator:
def _dtypes(self) -> abc.Iterable:
return zip(self._data.names, (col.dtype for col in self._data.columns))

@property
Expand Down Expand Up @@ -145,8 +144,6 @@ def _from_columns_like_self(
self,
columns: list[ColumnBase],
column_names: abc.Iterable[str] | None = None,
*,
override_dtypes: abc.Iterable[Dtype | None] | None = None,
):
"""Construct a Frame from a list of columns with metadata from self.
Expand All @@ -156,7 +153,7 @@ def _from_columns_like_self(
column_names = self._column_names
data = dict(zip(column_names, columns))
frame = self.__class__._from_data(data)
return frame._copy_type_metadata(self, override_dtypes=override_dtypes)
return frame._copy_type_metadata(self)

@_cudf_nvtx_annotate
def _mimic_inplace(
Expand Down Expand Up @@ -1032,29 +1029,14 @@ def _positions_from_column_names(self, column_names) -> list[int]:
]

@_cudf_nvtx_annotate
def _copy_type_metadata(
self,
other: Self,
*,
override_dtypes: abc.Iterable[Dtype | None] | None = None,
) -> Self:
def _copy_type_metadata(self: Self, other: Self) -> Self:
"""
Copy type metadata from each column of `other` to the corresponding
column of `self`.
If override_dtypes is provided, any non-None entry
will be used in preference to the relevant column of other to
provide the new dtype.
See `ColumnBase._with_type_metadata` for more information.
"""
if override_dtypes is None:
override_dtypes = itertools.repeat(None)
dtypes = (
dtype if dtype is not None else col.dtype
for (dtype, col) in zip(override_dtypes, other._data.values())
)
for (name, col), dtype in zip(self._data.items(), dtypes):
for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes):
self._data.set_by_label(
name, col._with_type_metadata(dtype), validate=False
)
Expand Down
25 changes: 8 additions & 17 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
from cudf.utils.utils import _warn_no_dask_cudf, search_range

if TYPE_CHECKING:
from collections.abc import Generator
from collections.abc import Generator, Iterable


class IndexMeta(type):
Expand Down Expand Up @@ -232,9 +232,7 @@ def __init__(
raise ValueError("Step must not be zero.") from err
raise

def _copy_type_metadata(
self, other: RangeIndex, *, override_dtypes=None
) -> Self:
def _copy_type_metadata(self: Self, other: Self) -> Self:
# There is no metadata to be copied for RangeIndex since it does not
# have an underlying column.
return self
Expand Down Expand Up @@ -485,6 +483,10 @@ def dtype(self):
dtype = np.dtype(np.int64)
return _maybe_convert_to_default_type(dtype)

@property
def _dtypes(self) -> Iterable:
return [(self.name, self.dtype)]

@_cudf_nvtx_annotate
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
Expand Down Expand Up @@ -1115,15 +1117,6 @@ def _binaryop(
return ret.values
return ret

# Override just to make mypy happy.
@_cudf_nvtx_annotate
def _copy_type_metadata(
self, other: Self, *, override_dtypes=None
) -> Self:
return super()._copy_type_metadata(
other, override_dtypes=override_dtypes
)

@property # type: ignore
@_cudf_nvtx_annotate
def _values(self):
Expand Down Expand Up @@ -1769,10 +1762,8 @@ def __init__(
raise ValueError("No unique frequency found")

@_cudf_nvtx_annotate
def _copy_type_metadata(
self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None
) -> Index:
super()._copy_type_metadata(other, override_dtypes=override_dtypes)
def _copy_type_metadata(self: Self, other: Self) -> Self:
super()._copy_type_metadata(other)
self._freq = _validate_freq(other._freq)
return self

Expand Down
101 changes: 29 additions & 72 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,17 +313,11 @@ def _from_columns_like_self(
columns: list[ColumnBase],
column_names: abc.Iterable[str] | None = None,
index_names: list[str] | None = None,
*,
override_dtypes: abc.Iterable[Dtype | None] | None = None,
) -> Self:
"""Construct a `Frame` from a list of columns with metadata from self.
If `index_names` is set, the first `len(index_names)` columns are
used to construct the index of the frame.
If override_dtypes is provided then any non-None entry will be
used for the dtype of the matching column in preference to the
dtype of the column in self.
"""
if column_names is None:
column_names = self._column_names
Expand All @@ -337,22 +331,24 @@ def _from_columns_like_self(
index = _index_from_data(
dict(enumerate(columns[:n_index_columns]))
)
index = index._copy_type_metadata(self.index)
# TODO: Should this if statement be handled in Index._copy_type_metadata?
if (
isinstance(self.index, cudf.CategoricalIndex)
and not isinstance(index, cudf.CategoricalIndex)
) or (
isinstance(self.index, cudf.MultiIndex)
and not isinstance(index, cudf.MultiIndex)
):
index = type(self.index)._from_data(index._data)
if isinstance(index, cudf.MultiIndex):
index.names = index_names
else:
index.name = index_names[0]

data = dict(zip(column_names, data_columns))
frame = self.__class__._from_data(data)

if index is not None:
# TODO: triage why using the setter here breaks dask_cuda.ProxifyHostFile
frame._index = index
return frame._copy_type_metadata(
self,
include_index=bool(index_names),
override_dtypes=override_dtypes,
)
frame = type(self)._from_data(data, index)
return frame._copy_type_metadata(self)

def __round__(self, digits=0):
# Shouldn't be added to BinaryOperand
Expand Down Expand Up @@ -1913,45 +1909,6 @@ def nans_to_nulls(self):
self._data._from_columns_like_self(result)
)

def _copy_type_metadata(
self,
other: Self,
include_index: bool = True,
*,
override_dtypes: abc.Iterable[Dtype | None] | None = None,
) -> Self:
"""
Copy type metadata from each column of `other` to the corresponding
column of `self`.
See `ColumnBase._with_type_metadata` for more information.
"""
super()._copy_type_metadata(other, override_dtypes=override_dtypes)
if (
include_index
and self.index is not None
and other.index is not None
):
self.index._copy_type_metadata(other.index)
# When other.index is a CategoricalIndex, the current index
# will be a NumericalIndex with an underlying CategoricalColumn
# (the above _copy_type_metadata call will have converted the
# column). Calling cudf.Index on that column generates the
# appropriate index.
if isinstance(
other.index, cudf.core.index.CategoricalIndex
) and not isinstance(self.index, cudf.core.index.CategoricalIndex):
self.index = cudf.Index(
cast("cudf.Index", self.index)._column,
name=self.index.name,
)
elif isinstance(other.index, cudf.MultiIndex) and not isinstance(
self.index, cudf.MultiIndex
):
self.index = cudf.MultiIndex._from_data(
self.index._data, name=self.index.name
)
return self

@_cudf_nvtx_annotate
def interpolate(
self,
Expand Down Expand Up @@ -5195,36 +5152,36 @@ def _explode(self, explode_column: Any, ignore_index: bool):
# duplicated. If ignore_index is set, the original index is not
# exploded and will be replaced with a `RangeIndex`.
if not isinstance(self._data[explode_column].dtype, ListDtype):
data = self._data.copy(deep=True)
idx = None if ignore_index else self.index.copy(deep=True)
return self.__class__._from_data(data, index=idx)
result = self.copy()
if ignore_index:
result.index = RangeIndex(len(result))
return result

column_index = self._column_names.index(explode_column)
if not ignore_index and self.index is not None:
index_offset = self.index.nlevels
if not ignore_index:
idx_cols = self.index._columns
else:
index_offset = 0
idx_cols = ()

exploded = libcudf.lists.explode_outer(
[
*(self.index._data.columns if not ignore_index else ()),
*self._columns,
],
column_index + index_offset,
[*idx_cols, *self._columns],
column_index + len(idx_cols),
)
# We must copy inner datatype of the exploded list column to
# maintain struct dtype key names
exploded_dtype = cast(
element_type = cast(
ListDtype, self._columns[column_index].dtype
).element_type
exploded = [
column._with_type_metadata(element_type)
if i == column_index
else column
for i, column in enumerate(exploded, start=-len(idx_cols))
]
return self._from_columns_like_self(
exploded,
self._column_names,
self._index_names if not ignore_index else None,
override_dtypes=(
exploded_dtype if i == column_index else None
for i in range(len(self._columns))
),
self.index.names if not ignore_index else None,
)

@_cudf_nvtx_annotate
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
if TYPE_CHECKING:
from collections.abc import Generator

from typing_extensions import Self

from cudf._typing import DataFrameOrSeries


Expand Down Expand Up @@ -2100,9 +2102,7 @@ def _intersection(self, other, sort=None):
return midx

@_cudf_nvtx_annotate
def _copy_type_metadata(
self: MultiIndex, other: MultiIndex, *, override_dtypes=None
) -> MultiIndex:
def _copy_type_metadata(self: Self, other: Self) -> Self:
res = super()._copy_type_metadata(other)
if isinstance(other, MultiIndex):
res._names = other._names
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9466,6 +9466,24 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode):
assert_eq(expect, got, check_dtype=False)


def test_explode_preserve_categorical():
gdf = cudf.DataFrame(
{
"A": [[1, 2], None, [2, 3]],
"B": cudf.Series([0, 1, 2], dtype="category"),
}
)
result = gdf.explode("A")
expected = cudf.DataFrame(
{
"A": [1, 2, None, 2, 3],
"B": cudf.Series([0, 0, 1, 2, 2], dtype="category"),
}
)
expected.index = cudf.Index([0, 0, 1, 2, 2])
assert_eq(result, expected)


@pytest.mark.parametrize(
"df,ascending,expected",
[
Expand Down

0 comments on commit 102d30a

Please sign in to comment.