diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7a9ad732c50..49bede7f58c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type +from typing import Any, Dict, Optional, Tuple, Type, Union import cupy import numpy as np @@ -68,17 +68,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: return NotImplemented - def __init__( - self, - data=None, - dtype=None, - copy=False, - name=None, - tupleize_cols=True, - **kwargs, - ): - pass - @cached_property def _values(self) -> ColumnBase: raise NotImplementedError @@ -156,9 +145,7 @@ def equals(self, other, **kwargs): check_types = True try: - return super(BaseIndex, self).equals( - other, check_types=check_types - ) + return super().equals(other, check_types=check_types) except TypeError: return False @@ -764,10 +751,6 @@ def _copy_construct(self, **kwargs): cls = _dtype_to_index[data.dtype.type] except KeyError: cls = GenericIndex - # TODO: GenericIndex has a different API for __new__ - # than other Index types. Refactoring Index types will - # be necessary to clean this up. - kwargs["values"] = kwargs.pop("data") elif isinstance(data, StringColumn): cls = StringIndex elif isinstance(data, DatetimeColumn): @@ -1486,14 +1469,12 @@ class RangeIndex(BaseIndex): RangeIndex(start=1, stop=10, step=1, name='a') """ - def __new__( - cls, start, stop=None, step=1, dtype=None, copy=False, name=None - ) -> "RangeIndex": - + def __init__( + self, start, stop=None, step=1, dtype=None, copy=False, name=None + ): if step == 0: raise ValueError("Step must not be zero.") - out = SingleColumnFrame.__new__(cls) if isinstance(start, range): therange = start start = therange.start @@ -1501,13 +1482,11 @@ def __new__( step = therange.step if stop is None: start, stop = 0, start - out._start = int(start) - out._stop = int(stop) - out._step = int(step) if step is not None else 1 - out._index = None - out._name = name - - return out + self._start = int(start) + self._stop = int(stop) + self._step = int(step) if step is not None else 1 + self._index = None + self._name = name @property def name(self): @@ -1595,12 +1574,10 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name = self.name if name is None else name - _idx_new = RangeIndex( + return RangeIndex( start=self._start, stop=self._stop, step=self._step, name=name ) - return _idx_new - def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -1810,11 +1787,6 @@ def unique(self): return self -def index_from_range(start, stop=None, step=None): - vals = column.arange(start, stop, step, dtype=np.int64) - return as_index(vals) - - class GenericIndex(BaseIndex): """An array of orderable values that represent the indices of another Column @@ -1824,42 +1796,35 @@ class GenericIndex(BaseIndex): name: A string """ - def __new__(cls, values, **kwargs): + def __init__(self, data, **kwargs): """ Parameters ---------- - values : Column - The Column of values for this index + data : Column + The Column of data for this index name : str optional The name of the Index. If not provided, the Index adopts the value Column's name. Otherwise if this name is different from the value - Column's, the values Column will be cloned to adopt this name. + Column's, the data Column will be cloned to adopt this name. """ - out = SingleColumnFrame.__new__(cls) - out._initialize(values, **kwargs) - - return out - - def _initialize(self, values, **kwargs): - - kwargs = _setdefault_name(values, **kwargs) + kwargs = _setdefault_name(data, **kwargs) # normalize the input - if isinstance(values, cudf.Series): - values = values._column - elif isinstance(values, column.ColumnBase): - values = values + if isinstance(data, cudf.Series): + data = data._column + elif isinstance(data, column.ColumnBase): + data = data else: - if isinstance(values, (list, tuple)): - if len(values) == 0: - values = np.asarray([], dtype="int64") + if isinstance(data, (list, tuple)): + if len(data) == 0: + data = np.asarray([], dtype="int64") else: - values = np.asarray(values) - values = column.as_column(values) - assert isinstance(values, (NumericalColumn, StringColumn)) + data = np.asarray(data) + data = column.as_column(data) + assert isinstance(data, (NumericalColumn, StringColumn)) name = kwargs.get("name") - super(BaseIndex, self).__init__({name: values}) + super().__init__({name: data}) @property def _values(self): @@ -1889,13 +1854,7 @@ def copy(self, name=None, deep=False, dtype=None, names=None): dtype = self.dtype if dtype is None else dtype name = self.name if name is None else name - if isinstance(self, (StringIndex, CategoricalIndex)): - result = as_index(self._values.astype(dtype), name=name, copy=deep) - else: - result = as_index( - self._values.copy(deep=deep).astype(dtype), name=name - ) - return result + return as_index(self._values.astype(dtype), name=name, copy=deep) def __sizeof__(self): return self._values.__sizeof__() @@ -1963,8 +1922,8 @@ def __repr__(self): lines = output.split("\n") tmp_meta = lines[-1] - dtype_index = lines[-1].rfind(" dtype=") - prior_to_dtype = lines[-1][:dtype_index] + dtype_index = tmp_meta.rfind(" dtype=") + prior_to_dtype = tmp_meta[:dtype_index] lines = lines[:-1] lines.append(prior_to_dtype + " dtype='%s'" % self.dtype) if self.name is not None: @@ -1985,9 +1944,7 @@ def __getitem__(self, index): if not isinstance(index, int): res = as_index(res) res.name = self.name - return res - else: - return res + return res @property def dtype(self): @@ -2038,10 +1995,12 @@ class NumericIndex(GenericIndex): Index """ - def __new__(cls, data=None, dtype=None, copy=False, name=None): + # Subclasses must define the dtype they are associated with. + _dtype: Union[None, Type[np.number]] = None - out = SingleColumnFrame.__new__(cls) - dtype = _index_to_dtype[cls] + def __init__(self, data=None, dtype=None, copy=False, name=None): + + dtype = type(self)._dtype if copy: data = column.as_column(data, dtype=dtype).copy() @@ -2049,59 +2008,47 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): data = column.as_column(data, dtype=dtype) - out._initialize(data, **kwargs) - - return out + super().__init__(data, **kwargs) class Int8Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.int8 class Int16Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.int16 class Int32Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.int32 class Int64Index(NumericIndex): - def __init__(self, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.int64 class UInt8Index(NumericIndex): - def __init__(self, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.uint8 class UInt16Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.uint16 class UInt32Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.uint32 class UInt64Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.uint64 class Float32Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.float32 class Float64Index(NumericIndex): - def __init__(cls, data=None, dtype=None, copy=False, name=None): - pass + _dtype = np.float64 class DatetimeIndex(GenericIndex): @@ -2143,8 +2090,8 @@ class DatetimeIndex(GenericIndex): dtype='datetime64[ms]', name='a') """ - def __new__( - cls, + def __init__( + self, data=None, freq=None, tz=None, @@ -2156,14 +2103,12 @@ def __new__( dtype=None, copy=False, name=None, - ) -> "DatetimeIndex": + ): # we should be more strict on what we accept here but # we'd have to go and figure out all the semantics around # pandas dtindex creation first which. For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream - out = SingleColumnFrame.__new__(cls) - if freq is not None: raise NotImplementedError("Freq is not yet supported") if tz is not None: @@ -2188,8 +2133,7 @@ def __new__( data = column.as_column(data.values) elif isinstance(data, (list, tuple)): data = column.as_column(np.array(data, dtype="datetime64[ms]")) - out._initialize(data, **kwargs) - return out + super().__init__(data, **kwargs) @property def year(self): @@ -2406,8 +2350,8 @@ class TimedeltaIndex(GenericIndex): dtype='timedelta64[s]', name='delta-index') """ - def __new__( - cls, + def __init__( + self, data=None, unit=None, freq=None, @@ -2415,9 +2359,7 @@ def __new__( dtype="timedelta64[ns]", copy=False, name=None, - ) -> "TimedeltaIndex": - - out = SingleColumnFrame.__new__(cls) + ): if freq is not None: raise NotImplementedError("freq is not yet supported") @@ -2437,8 +2379,7 @@ def __new__( data = column.as_column(data.values) elif isinstance(data, (list, tuple)): data = column.as_column(np.array(data, dtype=dtype)) - out._initialize(data, **kwargs) - return out + super().__init__(data, **kwargs) def to_pandas(self): return pd.TimedeltaIndex( @@ -2532,15 +2473,15 @@ class CategoricalIndex(GenericIndex): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, name='a', dtype='category', name='a') """ # noqa: E501 - def __new__( - cls, + def __init__( + self, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, - ) -> "CategoricalIndex": + ): if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): if categories is not None or ordered is not None: raise ValueError( @@ -2549,7 +2490,6 @@ def __new__( ) if copy: data = column.as_column(data, dtype=dtype).copy(deep=True) - out = SingleColumnFrame.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, CategoricalColumn): data = data @@ -2589,9 +2529,7 @@ def __new__( elif ordered is False and data.ordered is True: data.cat().as_unordered(inplace=True) - out._initialize(data, **kwargs) - - return out + super().__init__(data, **kwargs) @property def codes(self): @@ -2770,12 +2708,11 @@ class IntervalIndex(GenericIndex): IntervalIndex """ - def __new__( - cls, data, closed=None, dtype=None, copy=False, name=None, - ) -> "IntervalIndex": + def __init__( + self, data, closed=None, dtype=None, copy=False, name=None, + ): if copy: data = column.as_column(data, dtype=dtype).copy() - out = SingleColumnFrame.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, IntervalColumn): data = data @@ -2792,8 +2729,7 @@ def __new__( data = column.as_column(data) data.dtype.closed = closed - out._initialize(data, **kwargs) - return out + super().__init__(data, **kwargs) def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): """ @@ -2847,8 +2783,7 @@ class StringIndex(GenericIndex): name: A string """ - def __new__(cls, values, copy=False, **kwargs): - out = SingleColumnFrame.__new__(cls) + def __init__(self, values, copy=False, **kwargs): kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): values = values.copy(deep=copy) @@ -2861,8 +2796,7 @@ def __new__(cls, values, copy=False, **kwargs): "Couldn't create StringIndex from passed in object" ) - out._initialize(values, **kwargs) - return out + super().__init__(values, **kwargs) def to_pandas(self): return pd.Index(self.to_array(), name=self.name, dtype="object") @@ -2971,26 +2905,10 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: np.float64: Float64Index, } -_index_to_dtype = { - Int8Index: np.int8, - Int16Index: np.int16, - Int32Index: np.int32, - Int64Index: np.int64, - UInt8Index: np.uint8, - UInt16Index: np.uint16, - UInt32Index: np.uint32, - UInt64Index: np.uint64, - Float32Index: np.float32, - Float64Index: np.float64, -} - def _setdefault_name(values, **kwargs): - if "name" not in kwargs or kwargs["name"] is None: - if not hasattr(values, "name"): - kwargs.update({"name": None}) - else: - kwargs.update({"name": values.name}) + if kwargs.get("name") is None: + kwargs["name"] = getattr(values, "name", None) return kwargs diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3735a36b3eb..90c637daed4 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -19,7 +19,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, SingleColumnFrame +from cudf.core.frame import SingleColumnFrame from cudf.core.index import BaseIndex, as_index from cudf.utils.utils import _maybe_indices_to_slice @@ -63,8 +63,8 @@ class MultiIndex(BaseIndex): ) """ - def __new__( - cls, + def __init__( + self, levels=None, codes=None, sortorder=None, @@ -74,7 +74,7 @@ def __new__( copy=False, name=None, **kwargs, - ) -> "MultiIndex": + ): if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") @@ -84,8 +84,7 @@ def __new__( "Use `names`, `name` is not yet supported" ) - out = Frame.__new__(cls) - super(BaseIndex, out).__init__() + super().__init__() if copy: if isinstance(codes, cudf.DataFrame): @@ -93,7 +92,7 @@ def __new__( if len(levels) > 0 and isinstance(levels[0], cudf.Series): levels = [level.copy(deep=True) for level in levels] - out._name = None + self._name = None column_names = [] if labels: @@ -119,11 +118,11 @@ def __new__( # try using those as the source_data column names: if len(dict.fromkeys(names)) == len(names): source_data.columns = names - out._data = source_data._data - out.names = names - out._codes = codes - out._levels = levels - return out + self._data = source_data._data + self.names = names + self._codes = codes + self._levels = levels + return # name setup if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),): @@ -145,42 +144,40 @@ def __new__( raise TypeError("Codes is not a Sequence of sequences") if isinstance(codes, cudf.DataFrame): - out._codes = codes + self._codes = codes elif len(levels) == len(codes): - out._codes = cudf.DataFrame() + self._codes = cudf.DataFrame() for i, codes in enumerate(codes): name = column_names[i] or i codes = column.as_column(codes) - out._codes[name] = codes.astype(np.int64) + self._codes[name] = codes.astype(np.int64) else: raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" ) - out._levels = [cudf.Series(level) for level in levels] - out._validate_levels_and_codes(out._levels, out._codes) + self._levels = [cudf.Series(level) for level in levels] + self._validate_levels_and_codes(self._levels, self._codes) source_data = cudf.DataFrame() - for i, name in enumerate(out._codes.columns): - codes = as_index(out._codes[name]._column) - if -1 in out._codes[name].values: + for i, name in enumerate(self._codes.columns): + codes = as_index(self._codes[name]._column) + if -1 in self._codes[name].values: # Must account for null(s) in _source_data column level = cudf.DataFrame( - {name: [None] + list(out._levels[i])}, - index=range(-1, len(out._levels[i])), + {name: [None] + list(self._levels[i])}, + index=range(-1, len(self._levels[i])), ) else: - level = cudf.DataFrame({name: out._levels[i]}) + level = cudf.DataFrame({name: self._levels[i]}) source_data[name] = libcudf.copying.gather( level, codes._data.columns[0] )._data[name] - out._data = source_data._data - out.names = names - - return out + self._data = source_data._data + self.names = names @property def names(self): @@ -314,8 +311,7 @@ def shape(self): @property def _source_data(self): - out = cudf.DataFrame._from_data(data=self._data) - return out + return cudf.DataFrame._from_data(data=self._data) @_source_data.setter def _source_data(self, value):