diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3e5ff9c18b5..f1d2accc5a8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3593,15 +3593,15 @@ def rename( if level is not None and isinstance(self.index, MultiIndex): level = self.index._get_level_label(level) - out_index = self.index.copy(deep=copy) - level_values = out_index.get_level_values(level) - level_values.to_frame().replace( + level_values = self.index.get_level_values(level) + ca = self.index._data.copy(deep=copy) + ca[level] = level_values._column.find_and_replace( to_replace=list(index.keys()), - value=list(index.values()), - inplace=True, + replacement=list(index.values()), + ) + out_index = type(self.index)._from_data( + ca, name=self.index.name ) - out_index._data[level] = column.as_column(level_values) - out_index._compute_levels_and_codes() else: to_replace = list(index.keys()) vals = list(index.values()) @@ -7058,12 +7058,8 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] index_names = [*self.index.names, *unique_named_levels.names] - new_index = MultiIndex.from_frame( - DataFrame._from_data( - dict(zip(range(0, len(new_index_columns)), new_index_columns)) - ), - names=index_names, - ) + new_index = MultiIndex._from_data(dict(enumerate(new_index_columns))) + new_index.names = index_names # Compute the column indices that serves as the input for # `interleave_columns` diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dbbd1eab6c8..6503dae6ff5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,9 +7,7 @@ import operator import pickle import warnings -from collections import abc from functools import cached_property -from numbers import Integral from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp @@ -20,7 +18,7 @@ import cudf._lib as libcudf from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype +from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core.algorithms import factorize @@ -64,6 +62,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: return indices +def _compute_levels_and_codes( + data: MutableMapping, +) -> tuple[list[cudf.Index], list[column.ColumnBase]]: + """Return MultiIndex level and codes from a ColumnAccessor-like mapping.""" + levels = [] + codes = [] + for col in data.values(): + code, cats = factorize(col) + codes.append(column.as_column(code.astype(np.int64))) + levels.append(cats) + + return levels, codes + + class MultiIndex(Frame, BaseIndex, NotIterable): """A multi-level or hierarchical index. @@ -146,50 +158,36 @@ def __init__( raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (abc.Sequence, np.ndarray, cp.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") - - if copy: - if isinstance(codes, cudf.DataFrame): - codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance( - levels[0], (cudf.Index, cudf.Series) - ): - levels = [level.copy(deep=True) for level in levels] - - if not isinstance(codes, cudf.DataFrame): - if len(levels) == len(codes): - codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - - levels = [ensure_index(level) for level in levels] - - if len(levels) != len(codes._data): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - if len({c.size for c in codes._data.columns}) != 1: + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + elif not (is_list_like(levels) and len(levels) > 0): + raise ValueError("Must pass non-zero length sequence of levels") + elif not (is_list_like(codes) and len(codes) > 0): + raise ValueError("Must pass non-zero length sequence of codes") + elif len(codes) != len(levels): raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" + f"levels must have the same length ({len(levels)}) " + f"as codes ({len(codes)})." ) + new_levels = [] + for level in levels: + new_level = ensure_index(level) + if copy and new_level is level: + new_level = new_level.copy(deep=True) + new_levels.append(new_level) + + new_codes = [] + for code in codes: + if not (is_list_like(code) or is_column_like(code)): + raise TypeError("Each code must be list-like") + new_code = column.as_column(code).astype("int64") + if copy and new_code is code: + new_code = new_code.copy(deep=True) + new_codes.append(new_code) + source_data = {} - for (column_name, code), level in zip(codes._data.items(), levels): + for i, (code, level) in enumerate(zip(new_codes, new_levels)): if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: @@ -202,13 +200,11 @@ def __init__( result_col = libcudf.copying.gather( [level._column], code, nullify=True ) - source_data[column_name] = result_col[0]._with_type_metadata( - level.dtype - ) + source_data[i] = result_col[0]._with_type_metadata(level.dtype) - super().__init__(source_data) - self._levels = levels - self._codes = codes + super().__init__(ColumnAccessor(source_data)) + self._levels = new_levels + self._codes = new_codes self._name = None self.names = names @@ -350,10 +346,37 @@ def _from_data( data: MutableMapping, name: Any = None, ) -> MultiIndex: - obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) - if name is not None: - obj.name = name - return obj + """ + Use when you have a ColumnAccessor-like mapping but no codes and levels. + """ + levels, codes = _compute_levels_and_codes(data) + return cls._simple_new( + data=ColumnAccessor(data), + levels=levels, + codes=codes, + names=pd.core.indexes.frozen.FrozenList(data.keys()), + name=name, + ) + + @classmethod + def _simple_new( + cls, + data: ColumnAccessor, + levels: list[cudf.Index], + codes: list[column.ColumnBase], + names: pd.core.indexes.frozen.FrozenList, + name: Any = None, + ) -> Self: + """ + Use when you have a ColumnAccessor-like mapping, codes, and levels. + """ + mi = object.__new__(cls) + mi._data = data + mi._levels = levels + mi._codes = codes + mi._names = names + mi._name = name + return mi @property # type: ignore @_performance_tracking @@ -421,18 +444,17 @@ def copy( 2020-08-28 AMZN 3401.80 MSFT 228.91 """ - - mi = MultiIndex._from_data(self._data.copy(deep=deep)) - if self._levels is not None: - mi._levels = [idx.copy(deep=deep) for idx in self._levels] - if self._codes is not None: - mi._codes = self._codes.copy(deep) if names is not None: - mi.names = names - elif self.names is not None: - mi.names = self.names.copy() - - return mi + names = pd.core.indexes.frozen.FrozenList(names) + else: + names = self.names + return type(self)._simple_new( + data=self._data.copy(deep=deep), + levels=[idx.copy(deep=deep) for idx in self._levels], + codes=[code.copy(deep=deep) for code in self._codes], + names=names, + name=name, + ) @_performance_tracking def __repr__(self): @@ -478,14 +500,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property - def _codes_frame(self): - if self._codes is None: - self._compute_levels_and_codes() - return self._codes - @property # type: ignore - @_external_only_api("Use ._codes_frame instead") + @_external_only_api("Use ._codes instead") @_performance_tracking def codes(self): """ @@ -505,7 +521,7 @@ def codes(self): FrozenList([[0, 1, 2], [0, 1, 2]]) """ return pd.core.indexes.frozen.FrozenList( - col.values for col in self._codes_frame._columns + col.values for col in self._codes ) def get_slice_bound(self, label, side, kind=None): @@ -519,13 +535,13 @@ def nlevels(self): @property # type: ignore @_performance_tracking - def levels(self): + def levels(self) -> list[cudf.Index]: """ Returns list of levels in the MultiIndex Returns ------- - List of Series objects + List of Index objects Examples -------- @@ -545,9 +561,9 @@ def levels(self): >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] """ # noqa: E501 - if self._levels is None: - self._compute_levels_and_codes() - return self._levels + return [ + idx.rename(name) for idx, name in zip(self._levels, self.names) + ] @property # type: ignore @_performance_tracking @@ -566,11 +582,10 @@ def _get_level_label(self, level): else if level is index of the level, then level label will be returned as per the index. """ - - if level in self._data.names: + if level in self.names: return level else: - return self._data.names[level] + return self.names[level] @_performance_tracking def isin(self, values, level=None): @@ -671,20 +686,6 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_performance_tracking - def _compute_levels_and_codes(self): - levels = [] - - codes = {} - for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() - cats.name = name - codes[name] = code.astype(np.int64) - levels.append(cats) - - self._levels = levels - self._codes = cudf.DataFrame._from_data(codes) - @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" @@ -823,7 +824,7 @@ def _index_and_downcast(self, result, index, index_key): result.names = index.names[size:] index = MultiIndex( levels=index.levels[size:], - codes=index._codes_frame.iloc[:, size:], + codes=index._codes[size:], names=index.names[size:], ) @@ -933,28 +934,29 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, abc.Sequence)): - index = np.array(index) - elif isinstance(index, slice): + if isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.as_column(range(start, stop, step)) - result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take( - index - ), - names=self.names, + idx = range(start, stop, step) + elif is_scalar(index): + idx = [index] + else: + idx = index + + indexer = column.as_column(idx) + ca = self._data._from_columns_like_self( + (col.take(indexer) for col in self._columns), verify=False + ) + codes = [code.take(indexer) for code in self._codes] + result = type(self)._simple_new( + data=ca, codes=codes, levels=self._levels, names=self.names ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: if flatten: return result.to_pandas()[0] - - if self._codes_frame is not None: - result._codes = self._codes_frame.take(index) - if self._levels is not None: - result._levels = self._levels - return result + else: + return result @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): @@ -1270,25 +1272,12 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ('NJ', 'Precip')], names=['state', 'observation']) """ - obj = cls.__new__(cls) - super(cls, obj).__init__() - - source_data = df.copy(deep=False) - source_data.reset_index(drop=True, inplace=True) - if isinstance(source_data, pd.DataFrame): - source_data = cudf.DataFrame.from_pandas(source_data) - - names = names if names is not None else source_data._data.names - # if names are unique - # try using those as the source_data column names: - if len(dict.fromkeys(names)) == len(names): - source_data.columns = names - obj._name = None - obj._data = source_data._data - obj.names = names - obj._codes = None - obj._levels = None - return obj + if isinstance(df, pd.DataFrame): + source_data = cudf.DataFrame.from_pandas(df) + else: + source_data = df + names = names if names is not None else source_data._column_names + return cls.from_arrays(source_data._columns, names=names) @classmethod @_performance_tracking @@ -1436,7 +1425,7 @@ def _poplevels(self, level): # update self self.names = names - self._compute_levels_and_codes() + self._levels, self._codes = _compute_levels_and_codes(self._data) return popped @@ -1560,13 +1549,19 @@ def to_pandas( ) -> pd.MultiIndex: # cudf uses np.iinfo(size_type_dtype).min as missing code # pandas uses -1 as missing code - pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) + pd_codes = ( + code.find_and_replace( + column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(-1, length=1), + ) + for code in self._codes + ) return pd.MultiIndex( levels=[ level.to_pandas(nullable=nullable, arrow_type=arrow_type) for level in self.levels ], - codes=[col.values_host for col in pd_codes._columns], + codes=[col.values_host for col in pd_codes], names=self.names, ) @@ -1741,13 +1736,9 @@ def _clean_nulls_from_index(self): @_performance_tracking def memory_usage(self, deep=False): - usage = sum(col.memory_usage for col in self._data.columns) - if self.levels: - for level in self.levels: - usage += level.memory_usage(deep=deep) - if self._codes_frame: - for col in self._codes_frame._data.columns: - usage += col.memory_usage + usage = sum(col.memory_usage for col in self._columns) + usage += sum(level.memory_usage(deep=deep) for level in self._levels) + usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking @@ -2043,7 +2034,7 @@ def _union(self, other, sort=None): ignore_index=True, ) - midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data) midx.names = self.names if self.names == other.names else None if sort in {None, True} and len(other): return midx.sort_values() @@ -2067,7 +2058,8 @@ def _intersection(self, other, sort=None): self_df.columns = col_names result_df = cudf.merge(self_df, other_df, how="inner") - midx = self.__class__.from_frame(result_df, names=res_name) + midx = type(self)._from_data(result_df._data) + midx.names = res_name if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -2077,6 +2069,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names + self._levels, self._codes = _compute_levels_and_codes(res._data) return res @_performance_tracking diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 07c2e9c3fcf..1941eec91eb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -832,25 +832,17 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi1._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi2._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi1._codes._data.items() - ] - rptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi2._codes._data.items() - ] + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 193d64a9e7f..a013745f71e 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -186,13 +186,11 @@ def test_MI(): } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = cudf.DataFrame( - { - "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - } - ) + codes = [ + [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + ] pd.options.display.max_rows = 999 pd.options.display.max_columns = 0 gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))