From 4bc2eba3429aa02b77967fae17d8bae0942ef927 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 6 Sep 2021 10:09:47 -0700
Subject: [PATCH 01/11] Remove source_data usage outside of multiindex.py.

---
 python/cudf/cudf/core/dataframe.py       |  2 +-
 python/cudf/cudf/core/frame.py           | 14 +++++++++-----
 python/cudf/cudf/core/groupby/groupby.py |  9 +++------
 python/cudf/cudf/core/index.py           |  2 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index aac0b027c0b..e793a8e8644 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3962,7 +3962,7 @@ def sort_index(
                     ]
                 else:
                     labels = [self.index._get_level_label(level)]
-                inds = self.index._source_data[labels].argsort(
+                inds = self.index.to_frame(index=False)[labels].argsort(
                     ascending=ascending, na_position=na_position
                 )
             else:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 33be14462d4..5f476bda7d7 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1856,7 +1856,7 @@ def sample(
             if isinstance(self, cudf.MultiIndex):
                 # TODO: Need to update this once MultiIndex is refactored,
                 # should be able to treat it similar to other Frame object
-                result = cudf.Index(self._source_data[gather_map])
+                result = cudf.Index(self.to_frame(index=False)[gather_map])
             else:
                 result = self[gather_map]
                 if not keep_index:
@@ -3168,9 +3168,13 @@ def _reindex(
             index = cudf.core.index.as_index(index)
 
             if isinstance(index, cudf.MultiIndex):
-                idx_dtype_match = (
-                    df.index._source_data.dtypes == index._source_data.dtypes
-                ).all()
+                idx_dtype_match = all(
+                    left_dtype == right_dtype
+                    for left_dtype, right_dtype in zip(
+                        (col.dtype for col in df.index._data.columns),
+                        (col.dtype for col in index._data.columns),
+                    )
+                )
             else:
                 idx_dtype_match = df.index.dtype == index.dtype
 
@@ -5152,7 +5156,7 @@ def _drop_rows_by_labels(
         # 1. Merge Index df and data df along column axis:
         # | id | ._index df | data column(s) |
         idx_nlv = obj._index.nlevels
-        working_df = obj._index._source_data
+        working_df = obj._index.to_frame(index=False)
         working_df.columns = [i for i in range(idx_nlv)]
         for i, col in enumerate(obj._data):
             working_df[idx_nlv + i] = obj._data[col]
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d98a78efb18..b16de048b8d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1334,12 +1334,9 @@ def keys(self):
         if nkeys == 0:
             return cudf.core.index.as_index([], name=None)
         elif nkeys > 1:
-            return cudf.MultiIndex(
-                source_data=cudf.DataFrame(
-                    dict(zip(range(nkeys), self._key_columns))
-                ),
-                names=self.names,
-            )
+            return cudf.MultiIndex._from_data(
+                dict(zip(range(nkeys), self._key_columns))
+            ).set_names(self.names)
         else:
             return cudf.core.index.as_index(
                 self._key_columns[0], name=self.names[0]
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 6b4b77fabc5..cc5cd474db9 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2203,7 +2203,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
     elif isinstance(arbitrary, pd.MultiIndex):
         return cudf.MultiIndex.from_pandas(arbitrary)
     elif isinstance(arbitrary, cudf.DataFrame):
-        return cudf.MultiIndex(source_data=arbitrary)
+        return cudf.MultiIndex.from_frame(arbitrary)
     return as_index(
         column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs
     )

From bfa5ef9ef48079f7c954b941c4257a7e436c56e0 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 6 Sep 2021 10:36:50 -0700
Subject: [PATCH 02/11] Fix test names.

---
 python/cudf/cudf/tests/test_repr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index fa6c4d9bf24..1ff56522b6e 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1168,7 +1168,7 @@ def test_timedelta_index_repr(index, expected_repr):
         100,
     ],
 )
-def test_mulitIndex_repr(pmi, max_seq_items):
+def test_multiIndex_repr(pmi, max_seq_items):
     pd.set_option("display.max_seq_items", max_seq_items)
     gmi = cudf.from_pandas(pmi)
 
@@ -1413,7 +1413,7 @@ def test_mulitIndex_repr(pmi, max_seq_items):
         ),
     ],
 )
-def test_mulitIndex_null_repr(gdi, expected_repr):
+def test_multiIndex_null_repr(gdi, expected_repr):
     actual_repr = gdi.__repr__()
 
     assert actual_repr.split() == expected_repr.split()

From d6bafecf3155aa66ad48401e7fde12c5b89531d3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 6 Sep 2021 12:55:17 -0700
Subject: [PATCH 03/11] First pass at removing uses of source_data in
 multiindex code.

---
 python/cudf/cudf/core/multiindex.py | 78 ++++++++++++-----------------
 1 file changed, 32 insertions(+), 46 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 3b364a3fa86..5a41f623baa 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -20,7 +20,7 @@
 from cudf.core.column import as_column, column
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
-from cudf.utils.utils import _maybe_indices_to_slice
+from cudf.utils.utils import _maybe_indices_to_slice, cached_property
 
 
 class MultiIndex(Frame, BaseIndex):
@@ -415,7 +415,7 @@ def copy(
             mi = MultiIndex(levels=levels, codes=codes, names=names, copy=deep)
             return mi
 
-        mi = MultiIndex(source_data=self._source_data.copy(deep=deep))
+        mi = MultiIndex._from_data(self._data.copy(deep=deep))
         if self._levels is not None:
             mi._levels = [s.copy(deep) for s in self._levels]
         if self._codes is not None:
@@ -448,7 +448,11 @@ def _popn(self, n):
         Removes n names, labels, and codes in order to build a new index
         for results.
         """
-        result = MultiIndex(source_data=self._source_data.iloc[:, n:])
+        result = MultiIndex(
+            levels=self.levels[n:],
+            codes=self.codes.iloc[:, n:],
+            names=self.names[n:],
+        )
         if self.names is not None:
             result.names = self.names[n:]
         return result
@@ -471,13 +475,9 @@ def __repr__(self):
         else:
             preprocess = self.copy(deep=False)
 
-        cols_nulls = [
-            preprocess._source_data._data[col].has_nulls
-            for col in preprocess._source_data._data
-        ]
-        if any(cols_nulls):
-            preprocess_df = preprocess._source_data
-            for name, col in preprocess_df._data.items():
+        if any(col.has_nulls for col in preprocess._data.columns):
+            preprocess_df = preprocess.to_frame(index=False)
+            for name, col in preprocess._data.items():
                 if isinstance(
                     col,
                     (
@@ -488,8 +488,6 @@ def __repr__(self):
                     preprocess_df[name] = col.astype("str").fillna(
                         cudf._NA_REP
                     )
-                else:
-                    preprocess_df[name] = col
 
             tuples_list = list(
                 zip(
@@ -506,18 +504,12 @@ def __repr__(self):
                 # TODO: Remove this whole `if` block,
                 # this is a workaround for the following issue:
                 # https://github.com/pandas-dev/pandas/issues/39984
-                temp_df = preprocess._source_data
-
-                preprocess_pdf = pd.DataFrame()
-                for col in temp_df.columns:
-                    if temp_df[col].dtype.kind == "f":
-                        preprocess_pdf[col] = temp_df[col].to_pandas(
-                            nullable=False
-                        )
-                    else:
-                        preprocess_pdf[col] = temp_df[col].to_pandas(
-                            nullable=True
-                        )
+                preprocess_pdf = pd.DataFrame(
+                    {
+                        name: col.to_pandas(nullable=(col.dtype.kind != "f"))
+                        for name, col in preprocess._data.items()
+                    }
+                )
 
                 preprocess_pdf.columns = preprocess.names
                 preprocess = pd.MultiIndex.from_frame(preprocess_pdf)
@@ -578,7 +570,7 @@ def nlevels(self):
         """
         Integer number of levels in this MultiIndex.
         """
-        return self._source_data.shape[1]
+        return len(self._data)
 
     @property
     def levels(self):
@@ -762,15 +754,14 @@ def where(self, cond, other=None, inplace=False):
     def _compute_levels_and_codes(self):
         levels = []
 
-        codes = cudf.DataFrame()
-        for name in self._source_data.columns:
-            code, cats = self._source_data[name].factorize()
+        codes = {}
+        for name, col in self._data.items():
+            code, cats = cudf.Series._from_data({None: col}).factorize()
             codes[name] = code.astype(np.int64)
-            cats = cudf.Series(cats, name=None)
-            levels.append(cats)
+            levels.append(cudf.Series(cats, name=None))
 
         self._levels = levels
-        self._codes = codes
+        self._codes = cudf.DataFrame._from_data(codes)
 
     def _compute_validity_mask(self, index, row_tuple, max_length):
         """ Computes the valid set of indices of values in the lookup
@@ -1478,21 +1469,15 @@ def from_pandas(cls, multiindex, nan_as_null=None):
         # which preserves all levels of `multiindex`.
         names = tuple(range(len(multiindex.names)))
 
-        mi = cls(
+        return cls(
             names=multiindex.names,
             source_data=multiindex.to_frame(name=names),
             nan_as_null=nan_as_null,
         )
 
-        return mi
-
-    @property
+    @cached_property
     def is_unique(self):
-        if not hasattr(self, "_is_unique"):
-            self._is_unique = len(self._source_data) == len(
-                self._source_data.drop_duplicates(ignore_index=True)
-            )
-        return self._is_unique
+        return len(self) == len(self.unique())
 
     @property
     def is_monotonic(self):
@@ -1525,14 +1510,15 @@ def is_monotonic_decreasing(self):
         )
 
     def argsort(self, ascending=True, **kwargs):
-        indices = self._source_data.argsort(ascending=ascending, **kwargs)
-        return cupy.asarray(indices)
+        return self._get_sorted_inds(ascending=ascending, **kwargs).values
 
     def sort_values(self, return_indexer=False, ascending=True, key=None):
         if key is not None:
             raise NotImplementedError("key parameter is not yet implemented.")
 
-        indices = self._source_data.argsort(ascending=ascending)
+        indices = cudf.Series._from_data(
+            {None: self._get_sorted_inds(ascending=ascending)}
+        )
         index_sorted = as_index(self.take(indices), name=self.names)
 
         if return_indexer:
@@ -1581,21 +1567,21 @@ def fillna(self, value):
         return super().fillna(value=value)
 
     def unique(self):
-        return MultiIndex.from_frame(self._source_data.drop_duplicates())
+        return self.drop_duplicates()
 
     def _clean_nulls_from_index(self):
         """
         Convert all na values(if any) in MultiIndex object
         to `<NA>` as a preprocessing step to `__repr__` methods.
         """
-        index_df = self._source_data
+        index_df = self.to_frame(index=False)
         return MultiIndex.from_frame(
             index_df._clean_nulls_from_dataframe(index_df), names=self.names
         )
 
     def memory_usage(self, deep=False):
         n = 0
-        for col in self._source_data._columns:
+        for col in self._data._columns:
             n += col._memory_usage(deep=deep)
         if self._levels:
             for level in self._levels:

From 8a48911afb5b3f04c67f4667b992b3b39ede5597 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 Sep 2021 13:39:45 -0700
Subject: [PATCH 04/11] Rework pickling and clean up remaining instances of
 source_data.

---
 python/cudf/cudf/core/multiindex.py | 151 ++++++++++------------------
 1 file changed, 52 insertions(+), 99 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 5a41f623baa..61a443ce133 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -16,8 +16,8 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._typing import DataFrameOrSeries
+from cudf.core import column
 from cudf.core._compat import PANDAS_GE_120
-from cudf.core.column import as_column, column
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
 from cudf.utils.utils import _maybe_indices_to_slice, cached_property
@@ -153,7 +153,6 @@ def __init__(
         for i, n in enumerate(self._codes.columns):
             codes = as_index(self._codes[n]._column)
             if -1 in self._codes[n].values:
-                # Must account for null(s) in _source_data column
                 level = cudf.DataFrame(
                     {n: [None] + list(self._levels[i])},
                     index=range(-1, len(self._levels[i])),
@@ -465,11 +464,9 @@ def __repr__(self):
             # TODO: Update the following two arange calls to
             # a single arange call once arange has support for
             # a vector start/end points.
-            indices = cudf.core.column.arange(start=0, stop=n, step=1)
+            indices = column.arange(start=0, stop=n, step=1)
             indices = indices.append(
-                cudf.core.column.arange(
-                    start=len(self) - n, stop=len(self), step=1
-                )
+                column.arange(start=len(self) - n, stop=len(self), step=1)
             )
             preprocess = self.take(indices)
         else:
@@ -481,8 +478,8 @@ def __repr__(self):
                 if isinstance(
                     col,
                     (
-                        cudf.core.column.datetime.DatetimeColumn,
-                        cudf.core.column.timedelta.TimeDeltaColumn,
+                        column.datetime.DatetimeColumn,
+                        column.timedelta.TimeDeltaColumn,
                     ),
                 ):
                     preprocess_df[name] = col.astype("str").fillna(
@@ -767,19 +764,16 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
         """ Computes the valid set of indices of values in the lookup
         """
         lookup = cudf.DataFrame()
-        for idx, row in enumerate(row_tuple):
+        for name, row in zip(index.names, row_tuple):
             if isinstance(row, slice) and row == slice(None):
                 continue
-            lookup[index._source_data.columns[idx]] = cudf.Series(row)
+            lookup[name] = cudf.Series(row)
+        frame = index.to_frame(index=False)
         data_table = cudf.concat(
             [
-                index._source_data,
+                frame,
                 cudf.DataFrame(
-                    {
-                        "idx": cudf.Series(
-                            column.arange(len(index._source_data))
-                        )
-                    }
+                    {"idx": cudf.Series(column.arange(len(frame)))}
                 ),
             ],
             axis=1,
@@ -835,12 +829,12 @@ def _index_and_downcast(self, result, index, index_key):
         if isinstance(index_key, slice):
             slice_access = True
         out_index = cudf.DataFrame()
-        # Select the last n-k columns where n is the number of _source_data
-        # columns and k is the length of the indexing tuple
+        # Select the last n-k columns where n is the number of columns and k is
+        # the length of the indexing tuple
         size = 0
         if not isinstance(index_key, (numbers.Number, slice)):
             size = len(index_key)
-        for k in range(size, len(index._source_data.columns)):
+        for k in range(size, len(index._data)):
             if index.names is None:
                 name = k
             else:
@@ -848,7 +842,7 @@ def _index_and_downcast(self, result, index, index_key):
             out_index.insert(
                 len(out_index.columns),
                 name,
-                index._source_data[index._source_data.columns[k]],
+                cudf.Series._from_data({None: index._data[index.names[k]]}),
             )
 
         if len(result) == 1 and size == 0 and slice_access is False:
@@ -860,18 +854,17 @@ def _index_and_downcast(self, result, index, index_key):
             # Pandas returns an empty Series with a tuple as name
             # the one expected result column
             series_name = []
-            for code in index._source_data.columns:
-                series_name.append(index._source_data[code][0])
+            for code in index.names:
+                series_name.append(index._data[code][0])
             result = cudf.Series([])
             result.name = tuple(series_name)
         elif len(out_index.columns) == 1:
             # If there's only one column remaining in the output index, convert
             # it into an Index and name the final index values according
-            # to the _source_data column names
-            last_column = index._source_data.columns[-1]
-            out_index = index._source_data[last_column]
-            out_index = as_index(out_index)
-            out_index.name = index.names[len(index.names) - 1]
+            # to that column's name.
+            last_column_name = index.names[-1]
+            out_index = as_index(index._data[last_column_name])
+            out_index.name = last_column_name
             index = out_index
         elif len(out_index.columns) > 1:
             # Otherwise pop the leftmost levels, names, and codes from the
@@ -952,29 +945,14 @@ def __len__(self):
         return self._data.nrows
 
     def __eq__(self, other):
-        if not hasattr(other, "_levels"):
-            return False
-        # Lazy comparison
-        if isinstance(other, MultiIndex) or hasattr(other, "_source_data"):
+        if isinstance(other, MultiIndex):
             for self_col, other_col in zip(
-                self._source_data._data.values(),
-                other._source_data._data.values(),
+                self._data.values(), other._data.values(),
             ):
                 if not self_col.equals(other_col):
                     return False
             return self.names == other.names
-        else:
-            # Lazy comparison isn't possible - MI was created manually.
-            # Actually compare the MI, not its source data (it doesn't have
-            # any).
-            equal_levels = self.levels == other.levels
-            if isinstance(equal_levels, np.ndarray):
-                equal_levels = equal_levels.all()
-            return (
-                equal_levels
-                and self.codes.equals(other.codes)
-                and self.names == other.names
-            )
+        return NotImplemented
 
     @property
     def is_contiguous(self):
@@ -997,7 +975,9 @@ def take(self, indices):
         elif isinstance(indices, slice):
             start, stop, step = indices.indices(len(self))
             indices = column.arange(start, stop, step)
-        result = MultiIndex(source_data=self._source_data.take(indices))
+        result = MultiIndex.from_frame(
+            self.to_frame(index=False).take(indices)
+        )
         if self._codes is not None:
             result._codes = self._codes.take(indices)
         if self._levels is not None:
@@ -1010,26 +990,18 @@ def serialize(self):
         header["type-serialized"] = pickle.dumps(type(self))
         header["names"] = pickle.dumps(self.names)
 
-        header["source_data"], frames = self._source_data.serialize()
+        # header["source_data"], frames = self._source_data.serialize()
+        header["columns"], frames = column.serialize_columns(self._columns)
 
         return header, frames
 
     @classmethod
     def deserialize(cls, header, frames):
         names = pickle.loads(header["names"])
-
-        source_data_typ = pickle.loads(
-            header["source_data"]["type-serialized"]
-        )
-        source_data = source_data_typ.deserialize(
-            header["source_data"], frames
-        )
-
-        names = pickle.loads(header["names"])
-        return MultiIndex(names=names, source_data=source_data)
+        columns = column.deserialize_columns(header["columns"], frames)
+        return cls._from_data(dict(zip(names, columns)))
 
     def __getitem__(self, index):
-        # TODO: This should be a take of the _source_data only
         match = self.take(index)
         if isinstance(index, slice):
             return match
@@ -1041,7 +1013,10 @@ def __getitem__(self, index):
             return match
 
     def to_frame(self, index=True, name=None):
-        df = self._source_data
+        # TODO: Currently this function makes a shallow copy, which is
+        # incorrect. We want to make a deep copy, otherwise further
+        # modifications of the resulting DataFrame will affect the MultiIndex.
+        df = cudf.DataFrame._from_data(data=self._data)
         if index:
             df = df.set_index(self)
         if name is not None:
@@ -1065,7 +1040,7 @@ def get_level_values(self, level):
         -------
         An Index containing the values at the requested level.
         """
-        colnames = list(self._source_data.columns)
+        colnames = self._data.names
         if level not in colnames:
             if isinstance(level, int):
                 if level < 0:
@@ -1081,20 +1056,20 @@ def get_level_values(self, level):
                 raise KeyError(f"Level not found: '{level}'")
         else:
             level_idx = colnames.index(level)
-        level_values = as_index(
-            self._source_data._data[level], name=self.names[level_idx]
-        )
+        level_values = as_index(self._data[level], name=self.names[level_idx])
         return level_values
 
     @classmethod
     def _concat(cls, objs):
 
-        source_data = [o._source_data for o in objs]
+        source_data = [o.to_frame(index=False) for o in objs]
 
+        # TODO: Verify if this is really necessary or if we can rely on
+        # DataFrame._concat.
         if len(source_data) > 1:
-            for index, obj in enumerate(source_data[1:]):
-                obj.columns = source_data[0].columns
-                source_data[index + 1] = obj
+            colnames = source_data[0].columns
+            for obj in source_data[1:]:
+                obj.columns = colnames
 
         source_data = cudf.DataFrame._concat(source_data)
         names = [None for x in source_data.columns]
@@ -1102,7 +1077,7 @@ def _concat(cls, objs):
         for o in range(len(objs)):
             for i, name in enumerate(objs[o].names):
                 names[i] = names[i] or name
-        return cudf.MultiIndex(names=names, source_data=source_data)
+        return cudf.MultiIndex.from_frame(source_data, names=names)
 
     @classmethod
     def from_tuples(cls, tuples, names=None):
@@ -1198,7 +1173,7 @@ def values(self):
         >>> type(midx.values)
         <class 'cupy.core.core.ndarray'>
         """
-        return self._source_data.values
+        return self.to_frame(index=False).values
 
     @classmethod
     def from_frame(cls, df, names=None):
@@ -1294,8 +1269,7 @@ def from_product(cls, arrays, names=None):
         """
         # Use Pandas for handling Python host objects
         pdi = pd.MultiIndex.from_product(arrays, names=names)
-        result = cls.from_pandas(pdi)
-        return result
+        return cls.from_pandas(pdi)
 
     def _poplevels(self, level):
         """
@@ -1410,33 +1384,9 @@ def droplevel(self, level=-1):
             return mi
 
     def to_pandas(self, nullable=False, **kwargs):
-        if hasattr(self, "_source_data"):
-            result = self._source_data.to_pandas(nullable=nullable)
-            result.columns = self.names
-            return pd.MultiIndex.from_frame(result)
-
-        pandas_codes = []
-        for code in self.codes.columns:
-            pandas_codes.append(self.codes[code].to_array())
-
-        # We do two things here to mimic Pandas behavior:
-        # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
-        # 2. convert levels to numpy array so empty levels become Float64Index
-        levels = np.array(
-            [as_index(level).to_pandas() for level in self.levels]
-        )
-
-        # Backwards compatibility:
-        # Construct a dummy MultiIndex and check for the codes attr.
-        # This indicates that it is pandas >= 0.24
-        # If no codes attr is present it is pandas <= 0.23
-        if hasattr(pd.MultiIndex([[]], [[]]), "codes"):
-            pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes)
-        else:
-            pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes)
-        if self.names is not None:
-            pandas_mi.names = self.names
-        return pandas_mi
+        result = self.to_frame(index=False).to_pandas(nullable=nullable)
+        result.columns = self.names
+        return pd.MultiIndex.from_frame(result)
 
     @classmethod
     def from_pandas(cls, multiindex, nan_as_null=None):
@@ -1474,6 +1424,9 @@ def from_pandas(cls, multiindex, nan_as_null=None):
             source_data=multiindex.to_frame(name=names),
             nan_as_null=nan_as_null,
         )
+        # df = cudf.DataFrame.from_pandas(
+        #     multiindex.to_frame(name=names), nan_as_null)
+        # return cls.from_frame(df, names=multiindex.names)
 
     @cached_property
     def is_unique(self):
@@ -1792,7 +1745,7 @@ def get_loc(self, key, method=None, tolerance=None):
         # Handle partial key search. If length of `key` is less than `nlevels`,
         # Only search levels up to `len(key)` level.
         key_as_table = libcudf.table.Table(
-            {i: as_column(k, length=1) for i, k in enumerate(key)}
+            {i: column.as_column(k, length=1) for i, k in enumerate(key)}
         )
         partial_index = self.__class__._from_data(
             data=self._data.select_by_index(slice(key_as_table._num_columns))

From 4e3eb23a57527ef03daa5a1dd02990955c829720 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 Sep 2021 16:51:16 -0700
Subject: [PATCH 05/11] Stop passing source_data to constructor in from_pandas.

---
 python/cudf/cudf/core/multiindex.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 61a443ce133..dbf018af68c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1419,14 +1419,10 @@ def from_pandas(cls, multiindex, nan_as_null=None):
         # which preserves all levels of `multiindex`.
         names = tuple(range(len(multiindex.names)))
 
-        return cls(
-            names=multiindex.names,
-            source_data=multiindex.to_frame(name=names),
-            nan_as_null=nan_as_null,
+        df = cudf.DataFrame.from_pandas(
+            multiindex.to_frame(index=False, name=names), nan_as_null
         )
-        # df = cudf.DataFrame.from_pandas(
-        #     multiindex.to_frame(name=names), nan_as_null)
-        # return cls.from_frame(df, names=multiindex.names)
+        return cls.from_frame(df, names=multiindex.names)
 
     @cached_property
     def is_unique(self):

From e74b1d19693d946091edfd1f7c86badda3946ded Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 Sep 2021 16:55:22 -0700
Subject: [PATCH 06/11] Move source_data logic from constructor directly into
 from_frame.

---
 python/cudf/cudf/core/multiindex.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index dbf018af68c..9237d41d3ab 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1228,7 +1228,25 @@ def from_frame(cls, df, names=None):
                     ('NJ', 'Precip')],
                    names=['state', 'observation'])
         """
-        return cls(source_data=df, names=names)
+        obj = cls.__new__(cls)
+        super(cls, obj).__init__()
+
+        source_data = df.copy(deep=False)
+        source_data.reset_index(drop=True, inplace=True)
+        if isinstance(source_data, pd.DataFrame):
+            source_data = cudf.DataFrame.from_pandas(source_data)
+
+        names = names if names is not None else source_data._data.names
+        # if names are unique
+        # try using those as the source_data column names:
+        if len(dict.fromkeys(names)) == len(names):
+            source_data.columns = names
+        obj._name = None
+        obj._data = source_data._data
+        obj.names = names
+        obj._codes = None
+        obj._levels = None
+        return obj
 
     @classmethod
     def from_product(cls, arrays, names=None):

From a83ad5f2bbd2268c1cd846eeb0db90af11e1299a Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 Sep 2021 17:00:05 -0700
Subject: [PATCH 07/11] Remove all remaining references to _source_data.

---
 python/cudf/cudf/core/multiindex.py | 32 -----------------------------
 1 file changed, 32 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 9237d41d3ab..7e4e32f98e6 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -101,27 +101,6 @@ def __init__(
         if labels and not codes:
             codes = labels
 
-        # early termination enables lazy evaluation of codes
-        if "source_data" in kwargs:
-            source_data = kwargs["source_data"].copy(deep=False)
-            source_data.reset_index(drop=True, inplace=True)
-
-            if isinstance(source_data, pd.DataFrame):
-                nan_as_null = kwargs.get("nan_as_null", None)
-                source_data = cudf.DataFrame.from_pandas(
-                    source_data, nan_as_null=nan_as_null
-                )
-            names = names if names is not None else source_data._data.names
-            # if names are unique
-            # try using those as the source_data column names:
-            if len(dict.fromkeys(names)) == len(names):
-                source_data.columns = names
-            self._data = source_data._data
-            self.names = names
-            self._codes = codes
-            self._levels = levels
-            return
-
         if len(levels) == 0:
             raise ValueError("Must pass non-zero number of levels/codes")
 
@@ -133,7 +112,6 @@ def __init__(
         if isinstance(codes, cudf.DataFrame):
             self._codes = codes
         elif len(levels) == len(codes):
-            self._codes = cudf.DataFrame()
             self._codes = cudf.DataFrame._from_data(
                 {
                     i: column.as_column(code).astype(np.int64)
@@ -294,15 +272,6 @@ def _from_data(
     def shape(self):
         return (self._data.nrows, len(self._data.names))
 
-    @property
-    def _source_data(self):
-        return cudf.DataFrame._from_data(data=self._data)
-
-    @_source_data.setter
-    def _source_data(self, value):
-        self._data = value._data
-        self._compute_levels_and_codes()
-
     @property
     def name(self):
         return self._name
@@ -990,7 +959,6 @@ def serialize(self):
         header["type-serialized"] = pickle.dumps(type(self))
         header["names"] = pickle.dumps(self.names)
 
-        # header["source_data"], frames = self._source_data.serialize()
         header["columns"], frames = column.serialize_columns(self._columns)
 
         return header, frames

From e2a9f8ee2b1952e74723252c54cc580019f50aab Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Fri, 10 Sep 2021 12:58:37 -0700
Subject: [PATCH 08/11] Update python/cudf/cudf/core/groupby/groupby.py

---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 61734f10f1d..f1eeb9580fb 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1337,7 +1337,7 @@ def keys(self):
         elif nkeys > 1:
             return cudf.MultiIndex._from_data(
                 dict(zip(range(nkeys), self._key_columns))
-            ).set_names(self.names)
+            )._set_names(self.names)
         else:
             return cudf.core.index.as_index(
                 self._key_columns[0], name=self.names[0]

From 9be0f06f872550e7987f3f8ffc3c03cef9817db2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 13 Sep 2021 13:42:08 -0700
Subject: [PATCH 09/11] Fix all but one case where index name duplication could
 fail.

---
 python/cudf/cudf/core/multiindex.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 227bc00f613..8432a2622cd 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -812,7 +812,7 @@ def _index_and_downcast(self, result, index, index_key):
             out_index.insert(
                 len(out_index.columns),
                 name,
-                cudf.Series._from_data({None: index._data[index.names[k]]}),
+                cudf.Series._from_data({None: index._data.columns[k]}),
             )
 
         if len(result) == 1 and size == 0 and slice_access is False:
@@ -824,17 +824,17 @@ def _index_and_downcast(self, result, index, index_key):
             # Pandas returns an empty Series with a tuple as name
             # the one expected result column
             series_name = []
-            for code in index.names:
-                series_name.append(index._data[code][0])
+            for col in index._data.columns:
+                series_name.append(col[0])
             result = cudf.Series([])
             result.name = tuple(series_name)
         elif len(out_index.columns) == 1:
             # If there's only one column remaining in the output index, convert
             # it into an Index and name the final index values according
             # to that column's name.
-            last_column_name = index.names[-1]
-            out_index = as_index(index._data[last_column_name])
-            out_index.name = last_column_name
+            *_, last_column = index._data.columns
+            out_index = as_index(last_column)
+            out_index.name = index.names[-1]
             index = out_index
         elif len(out_index.columns) > 1:
             # Otherwise pop the leftmost levels, names, and codes from the

From 01a071f3e87a2ca0c60b803cd98d0e3c0e7132d8 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 13 Sep 2021 14:22:06 -0700
Subject: [PATCH 10/11] Add backwards compatibility layer for pickled objects.

---
 python/cudf/cudf/core/multiindex.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 8432a2622cd..87af3749f3a 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -967,6 +967,18 @@ def serialize(self):
     @classmethod
     def deserialize(cls, header, frames):
         names = pickle.loads(header["names"])
+        if "source_data" in header:
+            warnings.warn(
+                "MultiIndex objects serialized in cudf version "
+                "21.08 or older will no longer be deserializable "
+                "after version 21.10. Please load and resave any "
+                "pickles before upgrading to version 21.12.",
+                DeprecationWarning,
+            )
+            df = cudf.DataFrame.deserialize(header["source_data"], frames)
+            obj = cls.from_frame(df)
+            obj = obj._set_names(names)  # keep the returned index
+            return obj
         columns = column.deserialize_columns(header["columns"], frames)
         return cls._from_data(dict(zip(names, columns)))
 

From f5493ae4fa76f6298fd839ad615264d88790b69b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 15 Sep 2021 12:10:15 -0700
Subject: [PATCH 11/11] Always ignore index.

---
 python/cudf/cudf/core/multiindex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 87af3749f3a..0506fc38443 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1514,7 +1514,7 @@ def fillna(self, value):
         return super().fillna(value=value)
 
     def unique(self):
-        return self.drop_duplicates()
+        return self.drop_duplicates(ignore_index=True)
 
     def _clean_nulls_from_index(self):
         """