From 4bc2eba3429aa02b77967fae17d8bae0942ef927 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 6 Sep 2021 10:09:47 -0700 Subject: [PATCH 01/11] Remove source_data usage outside of multiindex.py. --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 14 +++++++++----- python/cudf/cudf/core/groupby/groupby.py | 9 +++------ python/cudf/cudf/core/index.py | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index aac0b027c0b..e793a8e8644 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3962,7 +3962,7 @@ def sort_index( ] else: labels = [self.index._get_level_label(level)] - inds = self.index._source_data[labels].argsort( + inds = self.index.to_frame(index=False)[labels].argsort( ascending=ascending, na_position=na_position ) else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 33be14462d4..5f476bda7d7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1856,7 +1856,7 @@ def sample( if isinstance(self, cudf.MultiIndex): # TODO: Need to update this once MultiIndex is refactored, # should be able to treat it similar to other Frame object - result = cudf.Index(self._source_data[gather_map]) + result = cudf.Index(self.to_frame(index=False)[gather_map]) else: result = self[gather_map] if not keep_index: @@ -3168,9 +3168,13 @@ def _reindex( index = cudf.core.index.as_index(index) if isinstance(index, cudf.MultiIndex): - idx_dtype_match = ( - df.index._source_data.dtypes == index._source_data.dtypes - ).all() + idx_dtype_match = all( + left_dtype == right_dtype + for left_dtype, right_dtype in zip( + (col.dtype for col in df.index._data.columns), + (col.dtype for col in index._data.columns), + ) + ) else: idx_dtype_match = df.index.dtype == index.dtype @@ -5152,7 +5156,7 @@ def _drop_rows_by_labels( # 1. 
Merge Index df and data df along column axis: # | id | ._index df | data column(s) | idx_nlv = obj._index.nlevels - working_df = obj._index._source_data + working_df = obj._index.to_frame(index=False) working_df.columns = [i for i in range(idx_nlv)] for i, col in enumerate(obj._data): working_df[idx_nlv + i] = obj._data[col] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d98a78efb18..b16de048b8d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1334,12 +1334,9 @@ def keys(self): if nkeys == 0: return cudf.core.index.as_index([], name=None) elif nkeys > 1: - return cudf.MultiIndex( - source_data=cudf.DataFrame( - dict(zip(range(nkeys), self._key_columns)) - ), - names=self.names, - ) + return cudf.MultiIndex._from_data( + dict(zip(range(nkeys), self._key_columns)) + ).set_names(self.names) else: return cudf.core.index.as_index( self._key_columns[0], name=self.names[0] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6b4b77fabc5..cc5cd474db9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2203,7 +2203,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: elif isinstance(arbitrary, pd.MultiIndex): return cudf.MultiIndex.from_pandas(arbitrary) elif isinstance(arbitrary, cudf.DataFrame): - return cudf.MultiIndex(source_data=arbitrary) + return cudf.MultiIndex.from_frame(arbitrary) return as_index( column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs ) From bfa5ef9ef48079f7c954b941c4257a7e436c56e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 6 Sep 2021 10:36:50 -0700 Subject: [PATCH 02/11] Fix test names. 
--- python/cudf/cudf/tests/test_repr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index fa6c4d9bf24..1ff56522b6e 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1168,7 +1168,7 @@ def test_timedelta_index_repr(index, expected_repr): 100, ], ) -def test_mulitIndex_repr(pmi, max_seq_items): +def test_multiIndex_repr(pmi, max_seq_items): pd.set_option("display.max_seq_items", max_seq_items) gmi = cudf.from_pandas(pmi) @@ -1413,7 +1413,7 @@ def test_mulitIndex_repr(pmi, max_seq_items): ), ], ) -def test_mulitIndex_null_repr(gdi, expected_repr): +def test_multiIndex_null_repr(gdi, expected_repr): actual_repr = gdi.__repr__() assert actual_repr.split() == expected_repr.split() From d6bafecf3155aa66ad48401e7fde12c5b89531d3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 6 Sep 2021 12:55:17 -0700 Subject: [PATCH 03/11] First pass at removing uses of source_data in multiindex code. 
--- python/cudf/cudf/core/multiindex.py | 78 ++++++++++++----------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3b364a3fa86..5a41f623baa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,7 +20,7 @@ from cudf.core.column import as_column, column from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import _maybe_indices_to_slice +from cudf.utils.utils import _maybe_indices_to_slice, cached_property class MultiIndex(Frame, BaseIndex): @@ -415,7 +415,7 @@ def copy( mi = MultiIndex(levels=levels, codes=codes, names=names, copy=deep) return mi - mi = MultiIndex(source_data=self._source_data.copy(deep=deep)) + mi = MultiIndex._from_data(self._data.copy(deep=deep)) if self._levels is not None: mi._levels = [s.copy(deep) for s in self._levels] if self._codes is not None: @@ -448,7 +448,11 @@ def _popn(self, n): Removes n names, labels, and codes in order to build a new index for results. 
""" - result = MultiIndex(source_data=self._source_data.iloc[:, n:]) + result = MultiIndex( + levels=self.levels[n:], + codes=self.codes.iloc[:, n:], + names=self.names[n:], + ) if self.names is not None: result.names = self.names[n:] return result @@ -471,13 +475,9 @@ def __repr__(self): else: preprocess = self.copy(deep=False) - cols_nulls = [ - preprocess._source_data._data[col].has_nulls - for col in preprocess._source_data._data - ] - if any(cols_nulls): - preprocess_df = preprocess._source_data - for name, col in preprocess_df._data.items(): + if any(col.has_nulls for col in preprocess._data.columns): + preprocess_df = preprocess.to_frame(index=False) + for name, col in preprocess._data.items(): if isinstance( col, ( @@ -488,8 +488,6 @@ def __repr__(self): preprocess_df[name] = col.astype("str").fillna( cudf._NA_REP ) - else: - preprocess_df[name] = col tuples_list = list( zip( @@ -506,18 +504,12 @@ def __repr__(self): # TODO: Remove this whole `if` block, # this is a workaround for the following issue: # https://github.com/pandas-dev/pandas/issues/39984 - temp_df = preprocess._source_data - - preprocess_pdf = pd.DataFrame() - for col in temp_df.columns: - if temp_df[col].dtype.kind == "f": - preprocess_pdf[col] = temp_df[col].to_pandas( - nullable=False - ) - else: - preprocess_pdf[col] = temp_df[col].to_pandas( - nullable=True - ) + preprocess_pdf = pd.DataFrame( + { + name: col.to_pandas(nullable=(col.dtype.kind != "f")) + for name, col in preprocess._data.items() + } + ) preprocess_pdf.columns = preprocess.names preprocess = pd.MultiIndex.from_frame(preprocess_pdf) @@ -578,7 +570,7 @@ def nlevels(self): """ Integer number of levels in this MultiIndex. 
""" - return self._source_data.shape[1] + return len(self._data) @property def levels(self): @@ -762,15 +754,14 @@ def where(self, cond, other=None, inplace=False): def _compute_levels_and_codes(self): levels = [] - codes = cudf.DataFrame() - for name in self._source_data.columns: - code, cats = self._source_data[name].factorize() + codes = {} + for name, col in self._data.items(): + code, cats = cudf.Series._from_data({None: col}).factorize() codes[name] = code.astype(np.int64) - cats = cudf.Series(cats, name=None) - levels.append(cats) + levels.append(cudf.Series(cats, name=None)) self._levels = levels - self._codes = codes + self._codes = cudf.DataFrame._from_data(codes) def _compute_validity_mask(self, index, row_tuple, max_length): """ Computes the valid set of indices of values in the lookup @@ -1478,21 +1469,15 @@ def from_pandas(cls, multiindex, nan_as_null=None): # which preserves all levels of `multiindex`. names = tuple(range(len(multiindex.names))) - mi = cls( + return cls( names=multiindex.names, source_data=multiindex.to_frame(name=names), nan_as_null=nan_as_null, ) - return mi - - @property + @cached_property def is_unique(self): - if not hasattr(self, "_is_unique"): - self._is_unique = len(self._source_data) == len( - self._source_data.drop_duplicates(ignore_index=True) - ) - return self._is_unique + return len(self) == len(self.unique()) @property def is_monotonic(self): @@ -1525,14 +1510,15 @@ def is_monotonic_decreasing(self): ) def argsort(self, ascending=True, **kwargs): - indices = self._source_data.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) + return self._get_sorted_inds(ascending=ascending, **kwargs).values def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: raise NotImplementedError("key parameter is not yet implemented.") - indices = self._source_data.argsort(ascending=ascending) + indices = cudf.Series._from_data( + {None: self._get_sorted_inds(ascending=ascending)} + ) 
index_sorted = as_index(self.take(indices), name=self.names) if return_indexer: @@ -1581,21 +1567,21 @@ def fillna(self, value): return super().fillna(value=value) def unique(self): - return MultiIndex.from_frame(self._source_data.drop_duplicates()) + return self.drop_duplicates() def _clean_nulls_from_index(self): """ Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. """ - index_df = self._source_data + index_df = self.to_frame(index=False) return MultiIndex.from_frame( index_df._clean_nulls_from_dataframe(index_df), names=self.names ) def memory_usage(self, deep=False): n = 0 - for col in self._source_data._columns: + for col in self._data._columns: n += col._memory_usage(deep=deep) if self._levels: for level in self._levels: From 8a48911afb5b3f04c67f4667b992b3b39ede5597 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 7 Sep 2021 13:39:45 -0700 Subject: [PATCH 04/11] Rework pickling and clean up remaining instances of source_data. 
--- python/cudf/cudf/core/multiindex.py | 151 ++++++++++------------------ 1 file changed, 52 insertions(+), 99 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5a41f623baa..61a443ce133 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -16,8 +16,8 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.core import column from cudf.core._compat import PANDAS_GE_120 -from cudf.core.column import as_column, column from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index from cudf.utils.utils import _maybe_indices_to_slice, cached_property @@ -153,7 +153,6 @@ def __init__( for i, n in enumerate(self._codes.columns): codes = as_index(self._codes[n]._column) if -1 in self._codes[n].values: - # Must account for null(s) in _source_data column level = cudf.DataFrame( {n: [None] + list(self._levels[i])}, index=range(-1, len(self._levels[i])), @@ -465,11 +464,9 @@ def __repr__(self): # TODO: Update the following two arange calls to # a single arange call once arange has support for # a vector start/end points. 
- indices = cudf.core.column.arange(start=0, stop=n, step=1) + indices = column.arange(start=0, stop=n, step=1) indices = indices.append( - cudf.core.column.arange( - start=len(self) - n, stop=len(self), step=1 - ) + column.arange(start=len(self) - n, stop=len(self), step=1) ) preprocess = self.take(indices) else: @@ -481,8 +478,8 @@ def __repr__(self): if isinstance( col, ( - cudf.core.column.datetime.DatetimeColumn, - cudf.core.column.timedelta.TimeDeltaColumn, + column.datetime.DatetimeColumn, + column.timedelta.TimeDeltaColumn, ), ): preprocess_df[name] = col.astype("str").fillna( @@ -767,19 +764,16 @@ def _compute_validity_mask(self, index, row_tuple, max_length): """ Computes the valid set of indices of values in the lookup """ lookup = cudf.DataFrame() - for idx, row in enumerate(row_tuple): + for name, row in zip(index.names, row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[index._source_data.columns[idx]] = cudf.Series(row) + lookup[name] = cudf.Series(row) + frame = index.to_frame(index=False) data_table = cudf.concat( [ - index._source_data, + frame, cudf.DataFrame( - { - "idx": cudf.Series( - column.arange(len(index._source_data)) - ) - } + {"idx": cudf.Series(column.arange(len(frame)))} ), ], axis=1, @@ -835,12 +829,12 @@ def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, slice): slice_access = True out_index = cudf.DataFrame() - # Select the last n-k columns where n is the number of _source_data - # columns and k is the length of the indexing tuple + # Select the last n-k columns where n is the number of columns and k is + # the length of the indexing tuple size = 0 if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) - for k in range(size, len(index._source_data.columns)): + for k in range(size, len(index._data)): if index.names is None: name = k else: @@ -848,7 +842,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( 
len(out_index.columns), name, - index._source_data[index._source_data.columns[k]], + cudf.Series._from_data({None: index._data[index.names[k]]}), ) if len(result) == 1 and size == 0 and slice_access is False: @@ -860,18 +854,17 @@ def _index_and_downcast(self, result, index, index_key): # Pandas returns an empty Series with a tuple as name # the one expected result column series_name = [] - for code in index._source_data.columns: - series_name.append(index._source_data[code][0]) + for code in index.names: + series_name.append(index._data[code][0]) result = cudf.Series([]) result.name = tuple(series_name) elif len(out_index.columns) == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according - # to the _source_data column names - last_column = index._source_data.columns[-1] - out_index = index._source_data[last_column] - out_index = as_index(out_index) - out_index.name = index.names[len(index.names) - 1] + # to that column's name. + last_column_name = index.names[-1] + out_index = as_index(index._data[last_column_name]) + out_index.name = last_column_name index = out_index elif len(out_index.columns) > 1: # Otherwise pop the leftmost levels, names, and codes from the @@ -952,29 +945,14 @@ def __len__(self): return self._data.nrows def __eq__(self, other): - if not hasattr(other, "_levels"): - return False - # Lazy comparison - if isinstance(other, MultiIndex) or hasattr(other, "_source_data"): + if isinstance(other, MultiIndex): for self_col, other_col in zip( - self._source_data._data.values(), - other._source_data._data.values(), + self._data.values(), other._data.values(), ): if not self_col.equals(other_col): return False return self.names == other.names - else: - # Lazy comparison isn't possible - MI was created manually. - # Actually compare the MI, not its source data (it doesn't have - # any). 
- equal_levels = self.levels == other.levels - if isinstance(equal_levels, np.ndarray): - equal_levels = equal_levels.all() - return ( - equal_levels - and self.codes.equals(other.codes) - and self.names == other.names - ) + return NotImplemented @property def is_contiguous(self): @@ -997,7 +975,9 @@ def take(self, indices): elif isinstance(indices, slice): start, stop, step = indices.indices(len(self)) indices = column.arange(start, stop, step) - result = MultiIndex(source_data=self._source_data.take(indices)) + result = MultiIndex.from_frame( + self.to_frame(index=False).take(indices) + ) if self._codes is not None: result._codes = self._codes.take(indices) if self._levels is not None: @@ -1010,26 +990,18 @@ def serialize(self): header["type-serialized"] = pickle.dumps(type(self)) header["names"] = pickle.dumps(self.names) - header["source_data"], frames = self._source_data.serialize() + # header["source_data"], frames = self._source_data.serialize() + header["columns"], frames = column.serialize_columns(self._columns) return header, frames @classmethod def deserialize(cls, header, frames): names = pickle.loads(header["names"]) - - source_data_typ = pickle.loads( - header["source_data"]["type-serialized"] - ) - source_data = source_data_typ.deserialize( - header["source_data"], frames - ) - - names = pickle.loads(header["names"]) - return MultiIndex(names=names, source_data=source_data) + columns = column.deserialize_columns(header["columns"], frames) + return cls._from_data(dict(zip(names, columns))) def __getitem__(self, index): - # TODO: This should be a take of the _source_data only match = self.take(index) if isinstance(index, slice): return match @@ -1041,7 +1013,10 @@ def __getitem__(self, index): return match def to_frame(self, index=True, name=None): - df = self._source_data + # TODO: Currently this function makes a shallow copy, which is + # incorrect. 
We want to make a deep copy, otherwise further + # modifications of the resulting DataFrame will affect the MultiIndex. + df = cudf.DataFrame._from_data(data=self._data) if index: df = df.set_index(self) if name is not None: @@ -1065,7 +1040,7 @@ def get_level_values(self, level): ------- An Index containing the values at the requested level. """ - colnames = list(self._source_data.columns) + colnames = self._data.names if level not in colnames: if isinstance(level, int): if level < 0: @@ -1081,20 +1056,20 @@ def get_level_values(self, level): raise KeyError(f"Level not found: '{level}'") else: level_idx = colnames.index(level) - level_values = as_index( - self._source_data._data[level], name=self.names[level_idx] - ) + level_values = as_index(self._data[level], name=self.names[level_idx]) return level_values @classmethod def _concat(cls, objs): - source_data = [o._source_data for o in objs] + source_data = [o.to_frame(index=False) for o in objs] + # TODO: Verify if this is really necessary or if we can rely on + # DataFrame._concat.
if len(source_data) > 1: - for index, obj in enumerate(source_data[1:]): - obj.columns = source_data[0].columns - source_data[index + 1] = obj + colnames = source_data[0].columns + for obj in source_data[1:]: + obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) names = [None for x in source_data.columns] @@ -1102,7 +1077,7 @@ def _concat(cls, objs): for o in range(len(objs)): for i, name in enumerate(objs[o].names): names[i] = names[i] or name - return cudf.MultiIndex(names=names, source_data=source_data) + return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod def from_tuples(cls, tuples, names=None): @@ -1198,7 +1173,7 @@ def values(self): >>> type(midx.values) """ - return self._source_data.values + return self.to_frame(index=False).values @classmethod def from_frame(cls, df, names=None): @@ -1294,8 +1269,7 @@ def from_product(cls, arrays, names=None): """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_product(arrays, names=names) - result = cls.from_pandas(pdi) - return result + return cls.from_pandas(pdi) def _poplevels(self, level): """ @@ -1410,33 +1384,9 @@ def droplevel(self, level=-1): return mi def to_pandas(self, nullable=False, **kwargs): - if hasattr(self, "_source_data"): - result = self._source_data.to_pandas(nullable=nullable) - result.columns = self.names - return pd.MultiIndex.from_frame(result) - - pandas_codes = [] - for code in self.codes.columns: - pandas_codes.append(self.codes[code].to_array()) - - # We do two things here to mimic Pandas behavior: - # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex - # 2. convert levels to numpy array so empty levels become Float64Index - levels = np.array( - [as_index(level).to_pandas() for level in self.levels] - ) - - # Backwards compatibility: - # Construct a dummy MultiIndex and check for the codes attr. 
- # This indicates that it is pandas >= 0.24 - # If no codes attr is present it is pandas <= 0.23 - if hasattr(pd.MultiIndex([[]], [[]]), "codes"): - pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes) - else: - pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes) - if self.names is not None: - pandas_mi.names = self.names - return pandas_mi + result = self.to_frame(index=False).to_pandas(nullable=nullable) + result.columns = self.names + return pd.MultiIndex.from_frame(result) @classmethod def from_pandas(cls, multiindex, nan_as_null=None): @@ -1474,6 +1424,9 @@ def from_pandas(cls, multiindex, nan_as_null=None): source_data=multiindex.to_frame(name=names), nan_as_null=nan_as_null, ) + # df = cudf.DataFrame.from_pandas( + # multiindex.to_frame(name=names), nan_as_null) + # return cls.from_frame(df, names=multiindex.names) @cached_property def is_unique(self): @@ -1792,7 +1745,7 @@ def get_loc(self, key, method=None, tolerance=None): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. key_as_table = libcudf.table.Table( - {i: as_column(k, length=1) for i, k in enumerate(key)} + {i: column.as_column(k, length=1) for i, k in enumerate(key)} ) partial_index = self.__class__._from_data( data=self._data.select_by_index(slice(key_as_table._num_columns)) From 4e3eb23a57527ef03daa5a1dd02990955c829720 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 7 Sep 2021 16:51:16 -0700 Subject: [PATCH 05/11] Stop passing source_data to constructor in from_pandas. --- python/cudf/cudf/core/multiindex.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 61a443ce133..dbf018af68c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1419,14 +1419,10 @@ def from_pandas(cls, multiindex, nan_as_null=None): # which preserves all levels of `multiindex`. 
names = tuple(range(len(multiindex.names))) - return cls( - names=multiindex.names, - source_data=multiindex.to_frame(name=names), - nan_as_null=nan_as_null, + df = cudf.DataFrame.from_pandas( + multiindex.to_frame(index=False, name=names), nan_as_null ) - # df = cudf.DataFrame.from_pandas( - # multiindex.to_frame(name=names), nan_as_null) - # return cls.from_frame(df, names=multiindex.names) + return cls.from_frame(df, names=multiindex.names) @cached_property def is_unique(self): From e74b1d19693d946091edfd1f7c86badda3946ded Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 7 Sep 2021 16:55:22 -0700 Subject: [PATCH 06/11] Move source_data logic from constructor directly into from_frame. --- python/cudf/cudf/core/multiindex.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dbf018af68c..9237d41d3ab 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1228,7 +1228,25 @@ def from_frame(cls, df, names=None): ('NJ', 'Precip')], names=['state', 'observation']) """ - return cls(source_data=df, names=names) + obj = cls.__new__(cls) + super(cls, obj).__init__() + + source_data = df.copy(deep=False) + source_data.reset_index(drop=True, inplace=True) + if isinstance(source_data, pd.DataFrame): + source_data = cudf.DataFrame.from_pandas(source_data) + + names = names if names is not None else source_data._data.names + # if names are unique + # try using those as the source_data column names: + if len(dict.fromkeys(names)) == len(names): + source_data.columns = names + obj._name = None + obj._data = source_data._data + obj.names = names + obj._codes = None + obj._levels = None + return obj @classmethod def from_product(cls, arrays, names=None): From a83ad5f2bbd2268c1cd846eeb0db90af11e1299a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 7 Sep 2021 17:00:05 -0700 Subject: [PATCH 07/11] Remove all 
remaining references to _source_data. --- python/cudf/cudf/core/multiindex.py | 32 ----------------------------- 1 file changed, 32 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9237d41d3ab..7e4e32f98e6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -101,27 +101,6 @@ def __init__( if labels and not codes: codes = labels - # early termination enables lazy evaluation of codes - if "source_data" in kwargs: - source_data = kwargs["source_data"].copy(deep=False) - source_data.reset_index(drop=True, inplace=True) - - if isinstance(source_data, pd.DataFrame): - nan_as_null = kwargs.get("nan_as_null", None) - source_data = cudf.DataFrame.from_pandas( - source_data, nan_as_null=nan_as_null - ) - names = names if names is not None else source_data._data.names - # if names are unique - # try using those as the source_data column names: - if len(dict.fromkeys(names)) == len(names): - source_data.columns = names - self._data = source_data._data - self.names = names - self._codes = codes - self._levels = levels - return - if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") @@ -133,7 +112,6 @@ def __init__( if isinstance(codes, cudf.DataFrame): self._codes = codes elif len(levels) == len(codes): - self._codes = cudf.DataFrame() self._codes = cudf.DataFrame._from_data( { i: column.as_column(code).astype(np.int64) @@ -294,15 +272,6 @@ def _from_data( def shape(self): return (self._data.nrows, len(self._data.names)) - @property - def _source_data(self): - return cudf.DataFrame._from_data(data=self._data) - - @_source_data.setter - def _source_data(self, value): - self._data = value._data - self._compute_levels_and_codes() - @property def name(self): return self._name @@ -990,7 +959,6 @@ def serialize(self): header["type-serialized"] = pickle.dumps(type(self)) header["names"] = pickle.dumps(self.names) - # header["source_data"], frames = 
self._source_data.serialize() header["columns"], frames = column.serialize_columns(self._columns) return header, frames From e2a9f8ee2b1952e74723252c54cc580019f50aab Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 10 Sep 2021 12:58:37 -0700 Subject: [PATCH 08/11] Update python/cudf/cudf/core/groupby/groupby.py --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 61734f10f1d..f1eeb9580fb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1337,7 +1337,7 @@ def keys(self): elif nkeys > 1: return cudf.MultiIndex._from_data( dict(zip(range(nkeys), self._key_columns)) - ).set_names(self.names) + )._set_names(self.names) else: return cudf.core.index.as_index( self._key_columns[0], name=self.names[0] From 9be0f06f872550e7987f3f8ffc3c03cef9817db2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Sep 2021 13:42:08 -0700 Subject: [PATCH 09/11] Fix all but one case where index name duplication could fail. 
--- python/cudf/cudf/core/multiindex.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 227bc00f613..8432a2622cd 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -812,7 +812,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( len(out_index.columns), name, - cudf.Series._from_data({None: index._data[index.names[k]]}), + cudf.Series._from_data({None: index._data.columns[k]}), ) if len(result) == 1 and size == 0 and slice_access is False: @@ -824,17 +824,17 @@ def _index_and_downcast(self, result, index, index_key): # Pandas returns an empty Series with a tuple as name # the one expected result column series_name = [] - for code in index.names: - series_name.append(index._data[code][0]) + for col in index._data.columns: + series_name.append(col[0]) result = cudf.Series([]) result.name = tuple(series_name) elif len(out_index.columns) == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - last_column_name = index.names[-1] - out_index = as_index(index._data[last_column_name]) - out_index.name = last_column_name + *_, last_column = index._data.columns + out_index = as_index(last_column) + out_index.name = index.names[-1] index = out_index elif len(out_index.columns) > 1: # Otherwise pop the leftmost levels, names, and codes from the From 01a071f3e87a2ca0c60b803cd98d0e3c0e7132d8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Sep 2021 14:22:06 -0700 Subject: [PATCH 10/11] Add backwards compatibility layer for pickled objects. 
--- python/cudf/cudf/core/multiindex.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8432a2622cd..87af3749f3a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -967,6 +967,18 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): names = pickle.loads(header["names"]) + if "source_data" in header: + warnings.warn( + "MultiIndex objects serialized in cudf version " + "21.08 or older will no longer be deserializable " + "after version 21.10. Please load and resave any " + "pickles before upgrading to version 21.12.", + DeprecationWarning, + ) + df = cudf.DataFrame.deserialize(header["source_data"], frames) + obj = cls.from_frame(df) + obj._set_names(names) + return obj columns = column.deserialize_columns(header["columns"], frames) return cls._from_data(dict(zip(names, columns))) From f5493ae4fa76f6298fd839ad615264d88790b69b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Sep 2021 12:10:15 -0700 Subject: [PATCH 11/11] Always ignore index. --- python/cudf/cudf/core/multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 87af3749f3a..0506fc38443 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1514,7 +1514,7 @@ def fillna(self, value): return super().fillna(value=value) def unique(self): - return self.drop_duplicates() + return self.drop_duplicates(ignore_index=True) def _clean_nulls_from_index(self): """