Skip to content

Commit

Permalink
Refactor Index hierarchy (#9039)
Browse files Browse the repository at this point in the history
Until now the class hierarchy of index types has had numerous logical flaws for reasons of convenience: for instance, `RangeIndex` was always inheriting from `Frame` despite not actually being backed by data, and since #8115 `MultiIndex` has been a `SingleColumnFrame` even though it actually has multiple columns. This PR moves `BaseIndex` to the top of its own hierarchy, and uses multiple inheritance with `Frame` and `SingleColumnFrame` to create a more sensible hierarchy for its subclasses. `BaseIndex` is now effectively an ABC defining the interface that subclasses must define, but many of these methods are still inherited from `Frame` types (or in the case of `RangeIndex`, delegated to `Int64Index`). 

These changes remove lots of broken behavior that was previously present in `MultiIndex` and `RangeIndex`; for instance, binary operations would previously fail in strange ways for `MultiIndex`, and various hacks were necessary for `MultiIndex` methods to bypass `SingleColumnFrame`. `RangeIndex` methods that delegate to `Int64Index` are now made explicit (rather than the previous implicit conversion via `self._data`). The new hierarchy also allows much more sensible type-checking by mypy, which revealed numerous additional conceptual issues. The bulk of this PR is actually moving functions around to make the type checker happy, some of which also fixed actual functional issues: for example, `RangeIndex.get_loc` was previously broken. The refactor will make it much easier to handle future changes to all classes in the index hierarchy.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - Michael Wang (https://github.com/isVoid)

URL: #9039
  • Loading branch information
vyasr authored Aug 31, 2021
1 parent fdcb90a commit 48bc39e
Show file tree
Hide file tree
Showing 17 changed files with 1,864 additions and 1,764 deletions.
26 changes: 16 additions & 10 deletions python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,12 @@ cdef class GroupBy:
c_grouped_values = move(c_groups.values)
c_group_offsets = c_groups.offsets

grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
move(c_grouped_keys),
column_names=range(c_grouped_keys.get()[0].num_columns())
))
grouped_keys = cudf.core.index._index_from_data(
*data_from_unique_ptr(
move(c_grouped_keys),
column_names=range(c_grouped_keys.get()[0].num_columns())
)
)
grouped_values = data_from_unique_ptr(
move(c_grouped_values),
index_names=values._index_names,
Expand Down Expand Up @@ -186,7 +188,8 @@ cdef class GroupBy:
Column.from_unique_ptr(move(c_result.second[i].results[j]))
)

return result_data, cudf.Index._from_data(grouped_keys)
return result_data, cudf.core.index._index_from_data(
grouped_keys)

def scan_internal(self, Table values, aggregations):
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -264,7 +267,8 @@ cdef class GroupBy:
Column.from_unique_ptr(move(c_result.second[i].results[j]))
)

return result_data, cudf.Index._from_data(grouped_keys)
return result_data, cudf.core.index._index_from_data(
grouped_keys)

def aggregate(self, Table values, aggregations):
"""
Expand Down Expand Up @@ -311,10 +315,12 @@ cdef class GroupBy:
self.c_obj.get()[0].shift(view, offsets, c_fill_values)
)

grouped_keys = cudf.Index._from_data(*data_from_unique_ptr(
move(c_result.first),
column_names=self.keys._column_names
))
grouped_keys = cudf.core.index._index_from_data(
*data_from_unique_ptr(
move(c_result.first),
column_names=self.keys._column_names
)
)

shifted, _ = data_from_unique_ptr(
move(c_result.second), column_names=values._column_names
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ cdef data_from_unique_ptr(
# Frame factories we may want to look for a less dissonant approach
# that does not impose performance penalties. The same applies to
# data_from_table_view below.
cudf.Index._from_data(
cudf.core.index._index_from_data(
{
name: columns[i]
for i, name in enumerate(index_names)
Expand Down Expand Up @@ -301,7 +301,8 @@ cdef data_from_table_view(
)
)
column_idx += 1
index = cudf.Index._from_data(dict(zip(index_names, index_columns)))
index = cudf.core.index._index_from_data(
dict(zip(index_names, index_columns)))

# Construct the data dict
cdef size_type source_column_idx = 0
Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@

DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"]
SeriesOrSingleColumnIndex = Union[
"cudf.Series", "cudf.core.index.GenericIndex"
]
2 changes: 1 addition & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def wrapped_func(obj):


def _union_categoricals(
to_union: List[Union[cudf.Series, cudf.Index]],
to_union: List[Union[cudf.Series, cudf.CategoricalIndex]],
sort_categories: bool = False,
ignore_order: bool = False,
):
Expand Down
Loading

0 comments on commit 48bc39e

Please sign in to comment.