Refactor Index hierarchy (#9039)

Until now the class hierarchy of index types has had numerous logical flaws for reasons of convenience: for instance, `RangeIndex` was always inheriting from `Frame` despite not actually being backed by data, and since #8115 `MultiIndex` has been a `SingleColumnFrame` even though it actually has multiple columns. This PR moves `BaseIndex` to the top of its own hierarchy, and uses multiple inheritance with `Frame` and `SingleColumnFrame` to create a more sensible hierarchy for its subclasses. `BaseIndex` is now effectively an ABC defining the interface that subclasses must define, but many of these methods are still inherited from `Frame` types (or in the case of `RangeIndex`, delegated to `Int64Index`). These changes remove lots of broken behavior that was previously present in `MultiIndex` and `RangeIndex`; for instance, binary operations would previously fail in strange ways for `MultiIndex`, and various hacks were necessary for `MultiIndex` methods to bypass `SingleColumnFrame`. `RangeIndex` methods that delegate to `Int64Index` are now made explicit (rather than the previous implicit conversion via `self._data`). The new hierarchy also allows much more sensible type-checking by mypy, which revealed numerous additional conceptual issues. The bulk of this PR is actually moving functions around to make the type checker happy, some of which also fixed actual functional issues: for example, `RangeIndex.get_loc` was previously broken. The refactor will make it much easier to handle future changes to all classes in the index hierarchy. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Marlene (https://github.com/marlenezw) - Michael Wang (https://github.com/isVoid) URL: #9039
rapidsai · Aug 31, 2021 · 48bc39e · 48bc39e
1 parent fdcb90a
commit 48bc39e
Show file tree

Hide file tree

Showing 17 changed files with 1,864 additions and 1,764 deletions.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
@@ -99,10 +99,12 @@ cdef class GroupBy:
         c_grouped_values = move(c_groups.values)
         c_group_offsets = c_groups.offsets
 
-        grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
-            move(c_grouped_keys),
-            column_names=range(c_grouped_keys.get()[0].num_columns())
-        ))
+        grouped_keys = cudf.core.index._index_from_data(
+            *data_from_unique_ptr(
+                move(c_grouped_keys),
+                column_names=range(c_grouped_keys.get()[0].num_columns())
+            )
+        )
         grouped_values = data_from_unique_ptr(
             move(c_grouped_values),
             index_names=values._index_names,
@@ -186,7 +188,8 @@ cdef class GroupBy:
                     Column.from_unique_ptr(move(c_result.second[i].results[j]))
                 )
 
-        return result_data, cudf.Index._from_data(grouped_keys)
+        return result_data, cudf.core.index._index_from_data(
+            grouped_keys)
 
     def scan_internal(self, Table values, aggregations):
         from cudf.core.column_accessor import ColumnAccessor
@@ -264,7 +267,8 @@ cdef class GroupBy:
                     Column.from_unique_ptr(move(c_result.second[i].results[j]))
                 )
 
-        return result_data, cudf.Index._from_data(grouped_keys)
+        return result_data, cudf.core.index._index_from_data(
+            grouped_keys)
 
     def aggregate(self, Table values, aggregations):
         """
@@ -311,10 +315,12 @@ cdef class GroupBy:
                 self.c_obj.get()[0].shift(view, offsets, c_fill_values)
             )
 
-        grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
-            move(c_result.first),
-            column_names=self.keys._column_names
-        ))
+        grouped_keys = cudf.core.index._index_from_data(
+            *data_from_unique_ptr(
+                move(c_result.first),
+                column_names=self.keys._column_names
+            )
+        )
 
         shifted, _ = data_from_unique_ptr(
             move(c_result.second), column_names=values._column_names

diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -251,7 +251,7 @@ cdef data_from_unique_ptr(
         # Frame factories we may want to look for a less dissonant approach
         # that does not impose performance penalties. The same applies to
         # data_from_table_view below.
-        cudf.Index._from_data(
+        cudf.core.index._index_from_data(
             {
                 name: columns[i]
                 for i, name in enumerate(index_names)
@@ -301,7 +301,8 @@ cdef data_from_table_view(
                 )
             )
             column_idx += 1
-        index = cudf.Index._from_data(dict(zip(index_names, index_columns)))
+        index = cudf.core.index._index_from_data(
+            dict(zip(index_names, index_columns)))
 
     # Construct the data dict
     cdef size_type source_column_idx = 0

diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
@@ -29,3 +29,6 @@
 
 DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
 SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"]
+SeriesOrSingleColumnIndex = Union[
+    "cudf.Series", "cudf.core.index.GenericIndex"
+]
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -194,7 +194,7 @@ def wrapped_func(obj):
 
 
 def _union_categoricals(
-    to_union: List[Union[cudf.Series, cudf.Index]],
+    to_union: List[Union[cudf.Series, cudf.CategoricalIndex]],
     sort_categories: bool = False,
     ignore_order: bool = False,
 ):