From 258bf3df9d0d29068985c43d43597f480165a17f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 30 May 2023 08:43:47 -0500 Subject: [PATCH] `Index` class deprecation enforcements (#13204) This PR: - [x] Enforces `Index` related deprecations by removing `Float32Index`, `Float64Index`, `GenericIndex`, `Int8Index`, `Int16Index`, `Int32Index`, `Int64Index`, `StringIndex`, `UInt8Index`, `UInt16Index`, `UInt32Index`, `UInt64Index`. - [x] Cleans up the repr logic to more closely align with pandas for `<NA>` value representation in case of `string` dtype. - [x] Fixes docstring and pytests to support the removals of the above classes. This PR also fixes 202 pytests: ```bash = 267 failed, 95670 passed, 2044 skipped, 763 xfailed, 300 xpassed in 442.18s (0:07:22) = ``` On `pandas_2.0_feature_branch`: ```bash = 469 failed, 95464 passed, 2044 skipped, 763 xfailed, 300 xpassed in 469.26s (0:07:49) = ``` --- docs/cudf/source/api_docs/index_objects.rst | 3 - docs/cudf/source/conf.py | 2 +- .../source/developer_guide/library_design.md | 25 +- python/cudf/benchmarks/conftest.py | 6 +- python/cudf/cudf/__init__.py | 24 - python/cudf/cudf/_typing.py | 6 +- python/cudf/cudf/core/_base_index.py | 72 +- python/cudf/cudf/core/algorithms.py | 8 +- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/string.py | 10 +- python/cudf/cudf/core/dataframe.py | 16 +- python/cudf/cudf/core/dtypes.py | 11 +- python/cudf/cudf/core/frame.py | 8 +- python/cudf/cudf/core/groupby/groupby.py | 3 +- python/cudf/cudf/core/index.py | 674 +++--------------- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/multiindex.py | 14 +- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/series.py | 6 +- python/cudf/cudf/core/single_column_frame.py | 4 +- python/cudf/cudf/testing/testing.py | 26 +- python/cudf/cudf/tests/test_binops.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 4 +- 
python/cudf/cudf/tests/test_groupby.py | 7 +- python/cudf/cudf/tests/test_index.py | 109 +-- python/cudf/cudf/tests/test_monotonic.py | 10 +- python/cudf/cudf/tests/test_pack.py | 10 +- python/cudf/cudf/tests/test_pickling.py | 6 +- python/cudf/cudf/tests/test_repr.py | 31 +- python/cudf/cudf/tests/test_serialize.py | 4 +- python/cudf/cudf/tests/test_string.py | 5 +- python/dask_cudf/dask_cudf/backends.py | 8 +- 33 files changed, 284 insertions(+), 846 deletions(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a8d18e9cb7..1b748a8f69f 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -149,9 +149,6 @@ Numeric Index :template: autosummary/class_without_autosummary.rst RangeIndex - Int64Index - UInt64Index - Float64Index .. _api.categoricalindex: diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 2d3d2494747..4d9558ecd33 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -261,7 +261,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): from the processed docstring. """ if what == "class": - if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: + if name in {"cudf.RangeIndex", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: cut_index = lines.index('.. 
rubric:: Attributes') lines[:] = lines[:cut_index] diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 16b84476549..e1f91a6417d 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -22,7 +22,7 @@ Finally we tie these pieces together to provide a more holistic view of the proj % class IndexedFrame % class SingleColumnFrame % class BaseIndex -% class GenericIndex +% class Index % class MultiIndex % class RangeIndex % class DataFrame @@ -42,8 +42,8 @@ Finally we tie these pieces together to provide a more holistic view of the proj % BaseIndex <|-- MultiIndex % Frame <|-- MultiIndex % -% BaseIndex <|-- GenericIndex -% SingleColumnFrame <|-- GenericIndex +% BaseIndex <|-- Index +% SingleColumnFrame <|-- Index % % @enduml @@ -89,31 +89,26 @@ While we've highlighted some exceptional cases of Indexes before, let's start wi In practice, `BaseIndex` does have concrete implementations of a small set of methods. However, currently many of these implementations are not applicable to all subclasses and will be eventually be removed. -Almost all indexes are subclasses of `GenericIndex`, a single-columned index with the class hierarchy: +Almost all indexes are subclasses of `Index`, a single-columned index with the class hierarchy: ```python -class GenericIndex(SingleColumnFrame, BaseIndex) +class Index(SingleColumnFrame, BaseIndex) ``` Integer, float, or string indexes are all composed of a single column of data. -Most `GenericIndex` methods are inherited from `Frame`, saving us the trouble of rewriting them. +Most `Index` methods are inherited from `Frame`, saving us the trouble of rewriting them. We now consider the three main exceptions to this model: - A `RangeIndex` is not backed by a column of data, so it inherits directly from `BaseIndex` alone. 
Wherever possible, its methods have special implementations designed to avoid materializing columns. - Where such an implementation is infeasible, we fall back to converting it to an `Int64Index` first instead. + Where such an implementation is infeasible, we fall back to converting it to an `Index` of `int64` + dtype first instead. - A `MultiIndex` is backed by _multiple_ columns of data. Therefore, its inheritance hierarchy looks like `class MultiIndex(Frame, BaseIndex)`. Some of its more `Frame`-like methods may be inherited, but many others must be reimplemented since in many cases a `MultiIndex` is not expected to behave like a `Frame`. -- Just like in pandas, `Index` itself can never be instantiated. - `pandas.Index` is the parent class for indexes, - but its constructor returns an appropriate subclass depending on the input data type and shape. - Unfortunately, mimicking this behavior requires overriding `__new__`, - which in turn makes shared initialization across inheritance trees much more cumbersome to manage. - To enable sharing constructor logic across different index classes, - we instead define `BaseIndex` as the parent class of all indexes. +- To enable sharing constructor logic across different index classes, + we define `BaseIndex` as the parent class of all indexes. `Index` inherits from `BaseIndex`, but it masquerades as a `BaseIndex` to match pandas. - This class should contain no implementations since it is simply a factory for other indexes. ## The Column layer diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 4f2bb96061f..5d0f80189c9 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. """Defines pytest fixtures for all benchmarks. 
@@ -40,8 +40,8 @@ In addition to the above fixtures, we also provide the following more specialized fixtures: - rangeindex: Since RangeIndex always holds int64 data we cannot conflate - it with index_dtype_int64 (a true Int64Index), and it cannot hold nulls. - As a result, it is provided as a separate fixture. + it with index_dtype_int64 (a true Index with int64 dtype), and it + cannot hold nulls. As a result, it is provided as a separate fixture. """ import os diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index de0f2d67add..c64da9a8ab2 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -40,22 +40,10 @@ BaseIndex, CategoricalIndex, DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, IntervalIndex, RangeIndex, - StringIndex, TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, interval_range, ) from cudf.core.missing import NA @@ -106,15 +94,8 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", - "Float32Index", - "Float64Index", - "GenericIndex", "Grouper", "Index", - "Int16Index", - "Int32Index", - "Int64Index", - "Int8Index", "IntervalDtype", "IntervalIndex", "ListDtype", @@ -123,13 +104,8 @@ "RangeIndex", "Scalar", "Series", - "StringIndex", "StructDtype", "TimedeltaIndex", - "UInt16Index", - "UInt32Index", - "UInt64Index", - "UInt8Index", "api", "concat", "crosstab", diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index e2ea12a0e4d..79762edbd65 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import sys from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union @@ -37,9 +37,7 @@ DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] -SeriesOrSingleColumnIndex = Union[ - "cudf.Series", "cudf.core.index.GenericIndex" -] +SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation AggType = Union[str, Callable] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7d16824174a..46e7cdfac61 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -58,9 +58,9 @@ >>> import cudf >>> index = cudf.Index([1, 2, 3]) >>> index -Int64Index([1, 2, 3], dtype='int64') +Index([1, 2, 3], dtype='int64') >>> index.astype('float64') -Float64Index([1.0, 2.0, 3.0], dtype='float64') +Index([1.0, 2.0, 3.0], dtype='float64') """ @@ -135,7 +135,7 @@ def get_level_values(self, level): >>> import cudf >>> idx = cudf.Index(["a", "b", "c"]) >>> idx.get_level_values(0) - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ if level == self.name: @@ -182,7 +182,7 @@ def _clean_nulls_from_index(self): to `` as a preprocessing step to `__repr__` methods. This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` + to string dtype but it is the responsibility of the `__repr__` methods using this method to replace or handle representation of the actual types correctly. 
""" @@ -225,7 +225,7 @@ def hasnans(self): >>> import numpy as np >>> index = cudf.Index([1, 2, np.nan, 3, 4], nan_as_null=False) >>> index - Float64Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') + Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') >>> index.hasnans True @@ -233,7 +233,7 @@ def hasnans(self): >>> index = cudf.Index([1, 2, None, 3, 4]) >>> index - Int64Index([1, 2, , 3, 4], dtype='int64') + Index([1, 2, , 3, 4], dtype='int64') >>> index.hasnans True """ @@ -286,9 +286,9 @@ def set_names(self, names, level=None, inplace=False): >>> import cudf >>> idx = cudf.Index([1, 2, 3, 4]) >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + Index([1, 2, 3, 4], dtype='int64', name='quarter') >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]]) >>> idx @@ -347,7 +347,7 @@ def union(self, other, sort=None): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + Index([1, 2, 3, 4, 5, 6], dtype='int64') MultiIndex case @@ -437,7 +437,7 @@ def intersection(self, other, sort=False): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') + Index([3, 4], dtype='int64') MultiIndex case @@ -541,9 +541,9 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, , 4], dtype='int64') + Index([1, 2, , 4], dtype='int64') >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') """ if downcast is not None: raise NotImplementedError( @@ -635,13 +635,13 @@ def to_pandas(self, nullable=False): >>> import cudf >>> idx = cudf.Index([-3, 10, 15, 20]) >>> idx - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> 
idx.to_pandas() - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> type(idx.to_pandas()) - + >>> type(idx) - + """ raise NotImplementedError @@ -666,7 +666,7 @@ def isin(self, values): -------- >>> idx = cudf.Index([1,2,3]) >>> idx - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') Check whether each index value in a list of values. @@ -736,17 +736,17 @@ def append(self, other): >>> import cudf >>> idx = cudf.Index([1, 2, 10, 100]) >>> idx - Int64Index([1, 2, 10, 100], dtype='int64') + Index([1, 2, 10, 100], dtype='int64') >>> other = cudf.Index([200, 400, 50]) >>> other - Int64Index([200, 400, 50], dtype='int64') + Index([200, 400, 50], dtype='int64') >>> idx.append(other) - Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') append accepts list of Index objects >>> idx.append([other, other]) - Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') """ raise NotImplementedError @@ -778,14 +778,14 @@ def difference(self, other, sort=None): >>> import cudf >>> idx1 = cudf.Index([2, 1, 3, 4]) >>> idx1 - Int64Index([2, 1, 3, 4], dtype='int64') + Index([2, 1, 3, 4], dtype='int64') >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx2 - Int64Index([3, 4, 5, 6], dtype='int64') + Index([3, 4, 5, 6], dtype='int64') >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') + Index([1, 2], dtype='int64') >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') + Index([2, 1], dtype='int64') """ if sort not in {None, False}: raise ValueError( @@ -1231,18 +1231,18 @@ def sort_values( >>> import cudf >>> idx = cudf.Index([10, 100, 1, 1000]) >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + Index([10, 100, 1, 1000], dtype='int64') Sort values in ascending order (default behavior). 
>>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + Index([1, 10, 100, 1000], dtype='int64') Sort values in descending order, and also get the indices `idx` was sorted by. >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], dtype=int32)) Sorting values in a MultiIndex: @@ -1319,7 +1319,7 @@ def join( names=['a', 'b']) >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs - Int64Index([1, 4, 3], dtype='int64', name='a') + Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') MultiIndex([(3, 4), (1, 2)], @@ -1402,12 +1402,12 @@ def rename(self, name, inplace=False): >>> import cudf >>> index = cudf.Index([1, 2, 3], name='one') >>> index - Int64Index([1, 2, 3], dtype='int64', name='one') + Index([1, 2, 3], dtype='int64', name='one') >>> index.name 'one' >>> renamed_index = index.rename('two') >>> renamed_index - Int64Index([1, 2, 3], dtype='int64', name='two') + Index([1, 2, 3], dtype='int64', name='two') >>> renamed_index.name 'two' """ @@ -1501,9 +1501,9 @@ def from_pandas(cls, index, nan_as_null=None): >>> data = [10, 20, 30, np.nan] >>> pdi = pd.Index(data) >>> cudf.Index.from_pandas(pdi) - Float64Index([10.0, 20.0, 30.0, ], dtype='float64') + Index([10.0, 20.0, 30.0, ], dtype='float64') >>> cudf.Index.from_pandas(pdi, nan_as_null=False) - Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') + Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") @@ -1674,7 +1674,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): -------- >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) >>> idx.take([2, 0, 4, 3]) - StringIndex(['c' 'a' 'e' 'd'], dtype='object') + Index(['c', 'a', 'e', 'd'], dtype='object') """ if axis not in {0, "index"}: @@ -1725,9 +1725,9 @@ def repeat(self, repeats, 
axis=None): -------- >>> index = cudf.Index([10, 22, 33, 55]) >>> index - Int64Index([10, 22, 33, 55], dtype='int64') + Index([10, 22, 33, 55], dtype='int64') >>> index.repeat(5) - Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33, + Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 55, 55, 55, 55, 55], dtype='int64') """ diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 50ec4b774ee..56bb575d6d6 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -46,7 +46,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([0, 1, 1], dtype=int8) >>> uniques - StringIndex(['a' 'c'], dtype='object') + Index(['a' 'c'], dtype='object') When ``use_na_sentinel=True`` (the default), missing values are indicated in the `codes` with the sentinel value ``-1`` and missing values are not @@ -56,7 +56,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([ 1, -1, 0, 2, 1], dtype=int8) >>> uniques - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') If NA is in the values, and we want to include NA in the uniques of the values, it can be achieved by setting ``use_na_sentinel=False``. 
@@ -66,12 +66,12 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([ 0, 1, 0, -1], dtype=int8) >>> uniques - Float64Index([1.0, 2.0], dtype='float64') + Index([1.0, 2.0], dtype='float64') >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False) >>> codes array([1, 2, 1, 0], dtype=int8) >>> uniques - Float64Index([, 1.0, 2.0], dtype='float64') + Index([, 1.0, 2.0], dtype='float64') """ return_cupy_array = isinstance(values, cp.ndarray) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c026574f8cd..6352f9f1fa0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -63,7 +63,7 @@ class CategoricalAccessor(ColumnMethods): dtype: category Categories (3, int64): [1, 2, 3] >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') >>> s.cat.reorder_categories([3,2,1]) 0 1 1 2 @@ -106,7 +106,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex): super().__init__(parent=parent) @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ The categories of this categorical. """ diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index c1b6dad00b7..0e7bcdc296c 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from __future__ import annotations @@ -8,7 +8,7 @@ import cudf -ParentType = Union["cudf.Series", "cudf.core.index.GenericIndex"] +ParentType = Union["cudf.Series", "cudf.core.index.Index"] class ColumnMethods: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2e74ec62204..0205d0ee43b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -708,9 +708,9 @@ def contains( >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN] >>> idx = cudf.Index(data) >>> idx - StringIndex(['Mouse' 'dog' 'house and parrot' '23.0' None], dtype='object') + Index(['Mouse' 'dog' 'house and parrot' '23.0' None], dtype='object') >>> idx.str.contains('23', regex=False) - GenericIndex([False, False, False, True, ], dtype='bool') + Index([False, False, False, True, ], dtype='bool') Returning 'house' or 'dog' when either expression occurs in a string. @@ -2811,7 +2811,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - StringIndex(['X 123' 'Y 999'], dtype='object') + Index(['X 123' 'Y 999'], dtype='object') Which will create a MultiIndex: @@ -2876,7 +2876,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - StringIndex(['X 123' 'Y 999'], dtype='object') + Index(['X 123' 'Y 999'], dtype='object') Which will create a MultiIndex: @@ -3542,7 +3542,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') + Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 675b870056d..624e378011a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ 
-1673,7 +1673,7 @@ def _concat( if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.GenericIndex._concat( + out._index = cudf.core.index.Index._concat( [o._index for o in objs] ) @@ -3381,7 +3381,7 @@ def rename( if index: if ( any(type(item) == str for item in index.values()) - and type(self.index) != cudf.StringIndex + and type(self.index._values) != cudf.core.column.StringColumn ): raise NotImplementedError( "Implicit conversion of index to " @@ -6606,7 +6606,7 @@ def keys(self): Columns: [0, 1, 2, 3] Index: [] >>> df.keys() - Int64Index([0, 1, 2, 3], dtype='int64') + Index([0, 1, 2, 3], dtype='int64') """ return self._data.to_pandas_index() @@ -7308,14 +7308,14 @@ def from_pandas(obj, nan_as_null=None): >>> pidx = pd.Index([1, 2, 10, 20]) >>> pidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> gidx = cudf.from_pandas(pidx) >>> gidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> type(gidx) - + >>> type(pidx) - + Converting a Pandas MultiIndex to cuDF MultiIndex: @@ -7494,7 +7494,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.index.GenericIndex._concat(indexes) + merged_index = cudf.core.index.Index._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index edd557aad1f..dce595b0843 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -162,7 +162,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None: self._ordered = ordered @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ An ``Index`` containing the 
unique categories allowed. @@ -171,7 +171,7 @@ def categories(self) -> "cudf.core.index.GenericIndex": >>> import cudf >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> dtype.categories - StringIndex(['b' 'a'], dtype='object') + Index(['b', 'a'], dtype='object') """ if self._categories is None: return cudf.core.index.as_index( @@ -238,9 +238,10 @@ def to_pandas(self) -> pd.CategoricalDtype: if self._categories is None: categories = None else: - if isinstance( - self._categories, (cudf.Float32Index, cudf.Float64Index) - ): + if self._categories.dtype in { + cudf.dtype("float32"), + cudf.dtype("float64"), + }: categories = self._categories.dropna().to_pandas() else: categories = self._categories.to_pandas() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c7330da5cfa..89b38fad376 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -210,12 +210,12 @@ def size(self): >>> index = cudf.Index([]) >>> index - Float64Index([], dtype='float64') + Index([], dtype='float64') >>> index.size 0 >>> index = cudf.Index([1, 2, 3, 10]) >>> index - Int64Index([1, 2, 3, 10], dtype='int64') + Index([1, 2, 3, 10], dtype='int64') >>> index.size 4 @@ -1289,7 +1289,7 @@ def isna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.isna() array([False, False, True, True, False, False]) """ @@ -1368,7 +1368,7 @@ def notna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.notna() array([ True, True, False, False, True, True]) """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f79a337373e..8e88d994708 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ 
b/python/cudf/cudf/core/groupby/groupby.py @@ -531,7 +531,8 @@ def agg(self, func): orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces - # a Float64Index, while Pandas returns an Int64Index + # an Index with float64 dtype, while Pandas returns + # an Index with int64 dtype. # (GH: 6945) ( result_columns, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 783f4012311..c0664d3ca4d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -9,12 +9,10 @@ from numbers import Number from typing import ( Any, - Dict, List, MutableMapping, Optional, Tuple, - Type, Union, ) @@ -22,6 +20,7 @@ import numpy as np import pandas as pd from pandas._config import get_option +from typing_extensions import Self import cudf from cudf._lib.datetime import extract_quarter, is_leap_year @@ -34,7 +33,6 @@ is_interval_dtype, is_list_like, is_scalar, - is_string_dtype, ) from cudf.core._base_index import BaseIndex, _index_astype_docstring from cudf.core.column import ( @@ -66,8 +64,33 @@ from cudf.core._compat import PANDAS_GE_200 +class IndexMeta(type): + """Custom metaclass for Index that overrides instance/subclass tests.""" + + def __call__(cls, data, *args, **kwargs): + if cls is Index: + return as_index( + arbitrary=data, + *args, + **kwargs, + ) + return super().__call__(data, *args, **kwargs) + + def __instancecheck__(self, instance): + if self is cudf.Index: + return isinstance(instance, BaseIndex) + else: + return False + + def __subclasscheck__(self, subclass): + if self is cudf.Index: + return issubclass(subclass, BaseIndex) + else: + return False + + def _lexsorted_equal_range( - idx: Union[GenericIndex, cudf.MultiIndex], + idx: Union[Index, cudf.MultiIndex], key_as_table: Frame, is_sorted: bool, ) -> Tuple[int, int, Optional[ColumnBase]]: @@ -100,18 +123,13 @@ def _index_from_data(data: MutableMapping, name: Any = None): values = next(iter(data.values())) if 
isinstance(values, NumericalColumn): - try: - index_class_type: Type[ - Union[GenericIndex, cudf.MultiIndex] - ] = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex + index_class_type = Index elif isinstance(values, DatetimeColumn): index_class_type = DatetimeIndex elif isinstance(values, TimeDeltaColumn): index_class_type = TimedeltaIndex elif isinstance(values, StringColumn): - index_class_type = StringIndex + index_class_type = Index elif isinstance(values, CategoricalColumn): index_class_type = CategoricalIndex elif isinstance(values, (IntervalColumn, StructColumn)): @@ -195,8 +213,8 @@ def __init__( self._end = self._start + self._step * (len(self._range) - 1) def _copy_type_metadata( - self: RangeIndex, other: RangeIndex, *, override_dtypes=None - ) -> RangeIndex: + self, other: RangeIndex, *, override_dtypes=None + ) -> Self: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. return self @@ -564,7 +582,7 @@ def __rmul__(self, other): def _as_int_index(self): # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. 
- return _dtype_to_index[self.dtype.type]._from_data(self._data) + return cudf.Index._from_data(self._data) @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -770,13 +788,13 @@ def sort_values( @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) @@ -784,7 +802,7 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._as_int_index()._split(splits)], [self.name] ) @@ -917,7 +935,7 @@ def __abs__(self): return abs(self._as_int_index()) -class GenericIndex(SingleColumnFrame, BaseIndex): +class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ An array of orderable values that represent the indices of another Column @@ -939,21 +957,6 @@ class GenericIndex(SingleColumnFrame, BaseIndex): @_cudf_nvtx_annotate def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) - - # normalize the input - if isinstance(data, cudf.Series): - data = data._column - elif isinstance(data, column.ColumnBase): - data = data - else: - if isinstance(data, (list, tuple)): - if len(data) == 0: - data = np.asarray([], dtype="int64") - else: - data = np.asarray(data) - data = column.as_column(data) - assert isinstance(data, (NumericalColumn, StringColumn)) - name = kwargs.get("name") super().__init__({name: data}) @@ -985,8 +988,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pandas returns 
numpy arrays when the outputs are boolean. for i, o in enumerate(out): # We explicitly _do not_ use isinstance here: we want only - # boolean GenericIndexes, not dtype-specific subclasses. - if type(o) is GenericIndex and o.dtype.kind == "b": + # boolean Indexes, not dtype-specific subclasses. + if type(o) is Index and o.dtype.kind == "b": out[i] = o.values return out[0] if ufunc.nout == 1 else tuple(out) @@ -995,14 +998,21 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @classmethod @_cudf_nvtx_annotate - def _from_data( - cls, data: MutableMapping, name: Any = None - ) -> GenericIndex: + def _from_data(cls, data: MutableMapping, name: Any = None) -> Self: out = super()._from_data(data=data) if name is not None: out.name = name return out + @classmethod + @_cudf_nvtx_annotate + def from_arrow(cls, obj): + try: + return cls(ColumnBase.from_arrow(obj)) + except TypeError: + # Try interpreting object as a MultiIndex before failing. + return cudf.MultiIndex.from_arrow(obj) + def _binaryop( self, other: Frame, @@ -1019,16 +1029,16 @@ def _binaryop( # pandas returns numpy arrays when the outputs are boolean. We # explicitly _do not_ use isinstance here: we want only boolean - # GenericIndexes, not dtype-specific subclasses. - if type(ret) is GenericIndex and ret.dtype.kind == "b": + # Indexes, not dtype-specific subclasses. + if type(ret) is Index and ret.dtype.kind == "b": return ret.values return ret # Override just to make mypy happy. 
@_cudf_nvtx_annotate def _copy_type_metadata( - self: GenericIndex, other: GenericIndex, *, override_dtypes=None - ) -> GenericIndex: + self, other: Self, *, override_dtypes=None + ) -> Self: return super()._copy_type_metadata( other, override_dtypes=override_dtypes ) @@ -1294,9 +1304,10 @@ def __repr__(self): output = output.replace("nan", cudf._NA_REP) elif preprocess._values.nullable: - output = repr(self._clean_nulls_from_index().to_pandas()) - - if not isinstance(self, StringIndex): + if isinstance(self._values, StringColumn): + output = repr(self.to_pandas(nullable=True)) + else: + output = repr(self._clean_nulls_from_index().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1341,7 +1352,7 @@ def __getitem__(self, index): @_cudf_nvtx_annotate def dtype(self): """ - `dtype` of the underlying values in GenericIndex. + `dtype` of the underlying values in Index. """ return self._values.dtype @@ -1382,19 +1393,21 @@ def get_slice_bound(self, label, side): return self._values.get_slice_bound(label, side) def _is_numeric(self): - return False + return isinstance( + self._values, cudf.core.column.NumericalColumn + ) and self.dtype != cudf.dtype("bool") def _is_boolean(self): - return True + return self.dtype == cudf.dtype("bool") def _is_integer(self): - return False + return cudf.api.types.is_integer_dtype(self.dtype) def _is_floating(self): - return False + return cudf.api.types.is_float_dtype(self.dtype) def _is_object(self): - return False + return isinstance(self._values, cudf.core.column.StringColumn) def _is_categorical(self): return False @@ -1536,333 +1549,19 @@ def isin(self, values): return self._values.isin(values).values - -class NumericIndex(GenericIndex): - """Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. 
- copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Index - """ - - # Subclasses must define the dtype they are associated with. - _dtype: Union[None, Type[np.number]] = None - + @copy_docstring(StringMethods) # type: ignore + @property @_cudf_nvtx_annotate - def __init__(self, data=None, dtype=None, copy=False, name=None): - warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - - dtype = type(self)._dtype - if copy: - data = column.as_column(data, dtype=dtype).copy() - - kwargs = _setdefault_name(data, name=name) - - data = column.as_column(data, dtype=dtype) - - super().__init__(data, **kwargs) - - def _is_numeric(self): - return True - - def _is_boolean(self): - return False - - def _is_integer(self): - return True - - def _is_floating(self): - return False - - def _is_object(self): - return False - - def _is_categorical(self): - return False - - def _is_interval(self): - return False - - -class Int8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int8Index is a special case of Index with purely - integer(``int8``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int8Index - """ - - _dtype = np.int8 - - -class Int16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int16Index is a special case of Index with purely - integer(``int16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. 
- name : object - Name to be stored in the index. - - Returns - ------- - Int16Index - """ - - _dtype = np.int16 - - -class Int32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int32Index is a special case of Index with purely - integer(``int32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int32Index - """ - - _dtype = np.int32 - - -class Int64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int64Index is a special case of Index with purely - integer(``int64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int64Index - """ - - _dtype = np.int64 - - -class UInt8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt8Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt8Index - """ - - _dtype = np.uint8 - - -class UInt16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt16Index is a special case of Index with purely - integer(``uint16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. 
- name : object - Name to be stored in the index. - - Returns - ------- - UInt16Index - """ - - _dtype = np.uint16 - - -class UInt32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt32Index is a special case of Index with purely - integer(``uint32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt32Index - """ - - _dtype = np.uint32 - - -class UInt64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt64Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt64Index - """ - - _dtype = np.uint64 - - -class Float32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float32Index is a special case of Index with purely - float(``float32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Float32Index - """ - - _dtype = np.float32 - - def _is_integer(self): - return False - - def _is_floating(self): - return True - - -class Float64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float64Index is a special case of Index with purely - float(``float64``) labels. 
- - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Float64Index - """ - - _dtype = np.float64 - - def _is_integer(self): - return False - - def _is_floating(self): - return True + def str(self): + if isinstance(self._values, cudf.core.column.StringColumn): + return StringMethods(parent=self) + else: + raise AttributeError( + "Can only use .str accessor with string values!" + ) -class DatetimeIndex(GenericIndex): +class DatetimeIndex(Index): """ Immutable , ordered and sliceable sequence of datetime64 data, represented internally as int64. @@ -1952,7 +1651,6 @@ def __init__( if copy: data = data.copy() - super().__init__(data, **kwargs) @property # type: ignore @@ -1970,7 +1668,7 @@ def year(self): >>> datetime_index DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year - Int16Index([2000, 2001, 2002], dtype='int16') + Index([2000, 2001, 2002], dtype='int16') """ # noqa: E501 return self._get_dt_field("year") @@ -1989,7 +1687,7 @@ def month(self): >>> datetime_index DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("month") @@ -2008,7 +1706,7 @@ def day(self): >>> datetime_index DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("day") @@ -2029,7 +1727,7 @@ def hour(self): '2000-01-01 02:00:00'], dtype='datetime64[ns]') >>> datetime_index.hour - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("hour") @@ -2050,7 +1748,7 @@ def minute(self): '2000-01-01 
00:02:00'], dtype='datetime64[ns]') >>> datetime_index.minute - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("minute") @@ -2071,7 +1769,7 @@ def second(self): '2000-01-01 00:00:02'], dtype='datetime64[ns]') >>> datetime_index.second - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("second") @@ -2092,7 +1790,7 @@ def microsecond(self): '2000-01-01 00:00:00.000002'], dtype='datetime64[ns]') >>> datetime_index.microsecond - Int32Index([0, 1, 2], dtype='int32') + Index([0, 1, 2], dtype='int32') """ # noqa: E501 return as_index( ( @@ -2124,7 +1822,7 @@ def nanosecond(self): '2000-01-01 00:00:00.000000002'], dtype='datetime64[ns]') >>> datetime_index.nanosecond - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("nanosecond") @@ -2146,7 +1844,7 @@ def weekday(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.weekday - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2168,7 +1866,7 @@ def dayofweek(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofweek - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2191,7 +1889,7 @@ def dayofyear(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofyear - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2214,7 +1912,7 @@ def day_of_year(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.day_of_year - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2249,7 +1947,7 @@ def quarter(self): Returns 
------- - Int8Index + Index Integer indicating which quarter the date belongs to. Examples @@ -2258,7 +1956,7 @@ def quarter(self): >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", ... "1999-12-31 18:40:00"]) >>> gIndex.quarter - Int8Index([2, 4], dtype='int8') + Index([2, 4], dtype='int8') """ res = extract_quarter(self._values) return Index(res, dtype="int8") @@ -2303,7 +2001,7 @@ def to_pandas(self, nullable=False): def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object - # but we need a NumericalColumn for GenericIndex.. + # but we need a NumericalColumn for Index.. # how should this be handled? out_column = column.build_column( data=out_column.base_data, @@ -2515,7 +2213,7 @@ def tz_convert(self, tz): return DatetimeIndex._from_data({self.name: result_col}) -class TimedeltaIndex(GenericIndex): +class TimedeltaIndex(Index): """ Immutable, ordered and sliceable sequence of timedelta64 data, represented internally as int64. @@ -2588,7 +2286,6 @@ def __init__( if copy: data = data.copy() - super().__init__(data, **kwargs) @_cudf_nvtx_annotate @@ -2605,8 +2302,9 @@ def days(self): """ Number of days for each element. """ + # Need to specifically return `int64` to avoid overflow. 
return as_index( - arbitrary=self._values.days, name=self.name, dtype="int32" + arbitrary=self._values.days, name=self.name, dtype="int64" ) @property # type: ignore @@ -2664,7 +2362,7 @@ def _is_boolean(self): return False -class CategoricalIndex(GenericIndex): +class CategoricalIndex(Index): """ A categorical of orderable values that represent the indices of another Column @@ -2759,7 +2457,6 @@ def __init__( data = data.as_ordered() elif ordered is False and data.ordered is True: data = data.as_unordered() - super().__init__(data, **kwargs) @property # type: ignore @@ -2929,7 +2626,7 @@ def interval_range( return IntervalIndex(interval_col) -class IntervalIndex(GenericIndex): +class IntervalIndex(Index): """ Immutable index of intervals that are closed on the same side. @@ -3043,80 +2740,6 @@ def _is_boolean(self): return False -class StringIndex(GenericIndex): - """String defined indices into another Column - - .. deprecated:: 23.06 - `StringIndex` is deprecated, use `Index` instead. - - Attributes - ---------- - _values: A StringColumn object or NDArray of strings - name: A string - """ - - @_cudf_nvtx_annotate - def __init__(self, values, copy=False, **kwargs): - warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. 
Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - kwargs = _setdefault_name(values, **kwargs) - if isinstance(values, StringColumn): - values = values.copy(deep=copy) - elif isinstance(values, StringIndex): - values = values._values.copy(deep=copy) - else: - values = column.as_column(values, dtype="str") - if not is_string_dtype(values.dtype): - raise ValueError( - "Couldn't create StringIndex from passed in object" - ) - - super().__init__(values, **kwargs) - - @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): - return pd.Index( - self.to_numpy(na_value=None), - name=self.name, - dtype=pd.StringDtype() if nullable else "object", - ) - - @_cudf_nvtx_annotate - def __repr__(self): - return ( - f"{self.__class__.__name__}({self._values.values_host}," - f" dtype='object'" - + ( - f", name={pd.io.formats.printing.default_pprint(self.name)}" - if self.name is not None - else "" - ) - + ")" - ) - - @copy_docstring(StringMethods) # type: ignore - @property - @_cudf_nvtx_annotate - def str(self): - return StringMethods(parent=self) - - def _clean_nulls_from_index(self): - if self._values.has_nulls(): - return self.fillna(cudf._NA_REP) - else: - return self - - def _is_boolean(self): - return False - - def _is_object(self): - return True - - @_cudf_nvtx_annotate def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -3137,7 +2760,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: result : subclass of Index - CategoricalIndex for Categorical input. - DatetimeIndex for Datetime input. - - GenericIndex for all other inputs. + - Index for all other inputs. 
""" kwargs = _setdefault_name(arbitrary, **kwargs) if isinstance(arbitrary, cudf.MultiIndex): @@ -3174,119 +2797,12 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: ) -_dtype_to_index: Dict[Any, Type[NumericIndex]] = { - np.int8: Int8Index, - np.int16: Int16Index, - np.int32: Int32Index, - np.int64: Int64Index, - np.uint8: UInt8Index, - np.uint16: UInt16Index, - np.uint32: UInt32Index, - np.uint64: UInt64Index, - np.float32: Float32Index, - np.float64: Float64Index, -} - - def _setdefault_name(values, **kwargs): if kwargs.get("name") is None: kwargs["name"] = getattr(values, "name", None) return kwargs -class IndexMeta(type): - """Custom metaclass for Index that overrides instance/subclass tests.""" - - def __instancecheck__(self, instance): - return isinstance(instance, BaseIndex) - - def __subclasscheck__(self, subclass): - return issubclass(subclass, BaseIndex) - - -class Index(BaseIndex, metaclass=IndexMeta): - """The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional)/ DataFrame - If it is a DataFrame, it will return a MultiIndex - dtype : NumPy dtype (default: object) - If dtype is None, we find the dtype that best fits the data. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - tupleize_cols : bool (default: True) - When True, attempt to create a MultiIndex if possible. - tupleize_cols == False is not yet supported. - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Returns - ------- - Index - cudf Index - - Warnings - -------- - This class should not be subclassed. It is designed as a factory for - different subclasses of :class:`BaseIndex` depending on the provided input. - If you absolutely must, and if you're intimately familiar with the - internals of cuDF, subclass :class:`BaseIndex` instead. 
- - Examples - -------- - >>> import cudf - >>> cudf.Index([1, 2, 3], dtype="uint64", name="a") - UInt64Index([1, 2, 3], dtype='uint64', name='a') - - >>> cudf.Index(cudf.DataFrame({"a":[1, 2], "b":[2, 3]})) - MultiIndex([(1, 2), - (2, 3)], - names=['a', 'b']) - """ - - @_cudf_nvtx_annotate - def __new__( - cls, - data=None, - dtype=None, - copy=False, - name=None, - tupleize_cols=True, - nan_as_null=True, - **kwargs, - ): - assert ( - cls is Index - ), "Index cannot be subclassed, extend BaseIndex instead." - if tupleize_cols is not True: - raise NotImplementedError( - "tupleize_cols != True is not yet supported" - ) - - return as_index( - data, - copy=copy, - dtype=dtype, - name=name, - nan_as_null=nan_as_null, - **kwargs, - ) - - @classmethod - @_cudf_nvtx_annotate - def from_arrow(cls, obj): - try: - return cls(ColumnBase.from_arrow(obj)) - except TypeError: - # Try interpreting object as a MultiIndex before failing. - return cudf.MultiIndex.from_arrow(obj) - - @_cudf_nvtx_annotate def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 40330b45e5b..e406ef14080 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -948,7 +948,7 @@ def _copy_type_metadata( self._index, cudf.core.index.CategoricalIndex ): self._index = cudf.Index( - cast(cudf.core.index.NumericIndex, self._index)._column, + cast("cudf.Index", self._index)._column, name=self._index.name, ) elif isinstance(other._index, cudf.MultiIndex) and not isinstance( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0498aa474b6..cdc120935ee 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1000,11 +1000,11 @@ def _concat(cls, objs): obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) - names = [None] * source_data._num_columns - objs = 
list(filter(lambda o: o.names is not None, objs)) - for o in range(len(objs)): - for i, name in enumerate(objs[o].names): - names[i] = names[i] or name + try: + # Only set names if all objs have the same names + (names,) = {o.names for o in objs} - {None} + except ValueError: + names = [None] * source_data._num_columns return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod @@ -1377,7 +1377,7 @@ def droplevel(self, level=-1): Dropping multiple levels: >>> idx.droplevel(["first", "second"]) - Int64Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') + Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ mi = self.copy(deep=False) mi._poplevels(level) @@ -1779,7 +1779,7 @@ def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common # logic between MultiIndex._union & BaseIndex._union into - # GenericIndex._union. + # Index._union. other_df = other.copy(deep=True).to_frame(index=False) self_df = self.copy(deep=True).to_frame(index=False) col_names = list(range(0, self.nlevels)) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b8164255e6d..d3cd84465ca 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -83,7 +83,7 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None): else: index = indexes[0] if sort is None: - sort = not isinstance(index, cudf.StringIndex) + sort = not index._is_object() for other in indexes[1:]: index = index.union(other, sort=False) @@ -427,7 +427,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.core.index.GenericIndex._concat(objs) + return cudf.core.index.Index._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") diff --git a/python/cudf/cudf/core/series.py 
b/python/cudf/cudf/core/series.py index 34936253bf0..4af8aee171c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1421,9 +1421,7 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = cudf.core.index.GenericIndex._concat( - [o.index for o in objs] - ) + index = cudf.core.index.Index._concat([o.index for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: @@ -3327,7 +3325,7 @@ def keys(self): c 3 dtype: int64 >>> sr.keys() - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ return self.index diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d058d4cee75..27cd1085fa7 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -167,7 +167,7 @@ def from_arrow(cls, array): >>> import cudf >>> import pyarrow as pa >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - StringIndex(['a' 'b' None], dtype='object') + Index(['a' 'b' None], dtype='object') >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) 0 a 1 b @@ -274,7 +274,7 @@ def factorize(self, sort=False, use_na_sentinel=True): >>> codes array([0, 0, 1], dtype=int8) >>> uniques - StringIndex(['a' 'c'], dtype='object') + Index(['a' 'c'], dtype='object') """ return cudf.core.algorithms.factorize( self, diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 484c013f774..0f54391b426 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -65,25 +65,17 @@ def _check_types( if not exact or exact == "equiv": if ( isinstance(left, cudf.RangeIndex) - and isinstance( - right, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(right, cudf.Index) + and hasattr(right, "dtype") + and 
right.dtype.kind == "i" ) ) or ( isinstance(right, cudf.RangeIndex) - and isinstance( - left, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(left, cudf.Index) + and hasattr(left, "dtype") + and left.dtype.kind == "i" ) ): return @@ -324,7 +316,7 @@ def assert_index_equal( exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted - for Int8Index, Int16Index, Int32Index, Int64Index as well. + for Index with an int8/int16/int32/int64 dtype as well. check_names : bool, default True Whether to check the names attribute. check_less_precise : bool or int, default False diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index e62f19f7877..c74d1fdd85b 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -664,11 +664,11 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) - # Test with a GenericIndex + # Test with an Index pdf2 = pd.DataFrame( {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] ) - # Test with a GenericIndex in a different order + # Test with an Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, index=[0, 3, 5, 3], diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5875959b0c2..e6f2f9ec448 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6518,7 +6518,7 @@ def test_dataframe_info_basic(): str_cmp = textwrap.dedent( """\ - StringIndex: 10 entries, a to 1111 + Index: 10 entries, a to 1111 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- @@ -6591,7 +6591,7 @@ def test_dataframe_info_verbose_mem_usage(): str_cmp
= textwrap.dedent( """\ - StringIndex: 3 entries, sdfdsf to dsfdf + Index: 3 entries, sdfdsf to dsfdf Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 5583b2290ae..7c610eca88c 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -33,6 +33,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) from cudf.testing.dataset_generator import rand_dataframe @@ -1290,7 +1291,7 @@ def test_groupby_index_type(): df["string_col"] = ["a", "b", "c"] df["counts"] = [1, 2, 3] res = df.groupby(by="string_col").counts.sum() - assert isinstance(res.index, cudf.StringIndex) + assert res.index.dtype == cudf.dtype("object") @pytest.mark.parametrize( @@ -2020,7 +2021,7 @@ def test_groupby_no_keys(pdf): pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 **kwargs, ) @@ -2038,7 +2039,7 @@ def test_groupby_apply_no_keys(pdf): assert_groupby_results_equal( pdf.groupby([], group_keys=False).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 **kwargs, ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 0bfd486ae74..de4c72389cf 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -15,7 +15,7 @@ from cudf.core.index import ( CategoricalIndex, DatetimeIndex, - GenericIndex, + Index, IntervalIndex, RangeIndex, as_index, @@ -204,9 +204,9 @@ def test_pandas_as_index(): gdf_category_index = as_index(pdf_category_index) # Check instance types - assert isinstance(gdf_int_index, GenericIndex) - assert isinstance(gdf_uint_index, GenericIndex) - assert 
isinstance(gdf_float_index, GenericIndex) + assert isinstance(gdf_int_index, Index) + assert isinstance(gdf_uint_index, Index) + assert isinstance(gdf_float_index, Index) assert isinstance(gdf_datetime_index, DatetimeIndex) assert isinstance(gdf_category_index, CategoricalIndex) @@ -329,7 +329,7 @@ def test_index_copy_datetime(name, deep=True): @pytest.mark.parametrize("name", ["x"]) def test_index_copy_string(name, deep=True): - cidx = cudf.StringIndex(["a", "b", "c"]) + cidx = cudf.Index(["a", "b", "c"]) pidx = cidx.to_pandas() pidx_copy = pidx.copy(name=name, deep=deep) @@ -393,12 +393,12 @@ def test_index_copy_deep(idx, deep, copy_on_write): original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) if ( - isinstance(idx, cudf.StringIndex) + isinstance(idx._values, cudf.core.column.StringColumn) or not deep or (cudf.get_option("copy_on_write") and not deep) ): # StringColumn is immutable hence, deep copies of a - # StringIndex will share the same StringColumn. + # Index with string dtype will share the same StringColumn. 
# When `copy_on_write` is turned on, Index objects will # have unique column object but they all point to same @@ -1207,91 +1207,48 @@ def test_index_basic(data, dtype, name): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) def test_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Int64Index(data, dtype=dtype, name=name) - # Int8Index - with pytest.warns(FutureWarning): - gindex = cudf.Int8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int8") - - # Int16Index - with pytest.warns(FutureWarning): - gindex = cudf.Int16Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int16") - - # Int32Index - with pytest.warns(FutureWarning): - gindex = cudf.Int32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.Int64Index(data, dtype=dtype, name=name) - # Int64Index - with pytest.warns(FutureWarning): - gindex = cudf.Int64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", UNSIGNED_TYPES) def test_unsigned_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.UInt64Index(data, dtype=dtype, name=name) - # UInt8Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint8") - - # UInt16Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt16Index(data, dtype=dtype, name=name) - - 
assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint16") - - # UInt32Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.UInt64Index(data, dtype=dtype, name=name) - # UInt64Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", FLOAT_TYPES) def test_float_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Float64Index(data, dtype=dtype, name=name) - # Float32Index - with pytest.warns(FutureWarning): - gindex = cudf.Float32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.Float64Index(data, dtype=dtype, name=name) - # Float64Index - with pytest.warns(FutureWarning): - gindex = cudf.Float64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @@ -1591,6 +1548,9 @@ def test_interval_index_from_breaks(closed): [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) @pytest.mark.parametrize( @@ -1604,6 +1564,9 @@ def test_interval_index_from_breaks(closed): [[1, 2, 
3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) def test_multiindex_append(data, other): @@ -1726,7 +1689,7 @@ def test_index_fillna(data, fill_value): assert_eq( pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False - ) # Int64Index v/s Float64Index + ) # Int64 v/s Float64 @pytest.mark.parametrize( @@ -1764,7 +1727,7 @@ def test_index_from_arrow(data): arrow_array = pa.Array.from_pandas(pdi) expected_index = pd.Index(arrow_array.to_pandas()) gdi = cudf.Index.from_arrow(arrow_array) - if PANDAS_GE_200: + if PANDAS_GE_200 and gdi.dtype == cudf.dtype("datetime64[s]"): # Arrow bug: # https://github.com/apache/arrow/issues/33321 # arrow cannot convert non-nanosecond diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 1fcf41389dc..db7e4588e95 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -13,7 +13,7 @@ from cudf.core.index import ( CategoricalIndex, DatetimeIndex, - GenericIndex, + Index, RangeIndex, ) from cudf.testing._utils import assert_eq @@ -49,7 +49,7 @@ def test_range_index(testrange): ) def test_generic_index(testlist): - index = GenericIndex(testlist) + index = Index(testlist) index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique @@ -222,7 +222,7 @@ def test_multiindex_tuples(testarr): ) @pytest.mark.parametrize("side", ["left", "right"]) def test_get_slice_bound(testlist, side): - index = GenericIndex(testlist) + index = Index(testlist) index_pd = pd.Index(testlist) for label in testlist: expect = index_pd.get_slice_bound(label, side) @@ -269,7 +269,7 @@ def test_rangeindex_get_slice_bound_step(bounds, label, side): @pytest.mark.parametrize("side", ["left", "right"]) def test_get_slice_bound_missing(label, side): mylist = [2, 4, 6, 8, 10] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = 
pd.Index(mylist) expect = index_pd.get_slice_bound(label, side) @@ -284,7 +284,7 @@ def test_get_slice_bound_missing_str(label, side): # Slicing for monotonic string indices not yet supported # when missing values are specified (allowed in pandas) mylist = ["b", "d", "f"] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = pd.Index(mylist) got = index.get_slice_bound(label, side) expect = index_pd.get_slice_bound(label, side) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index 9b5a8c19cf5..9011efebedb 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, Index, Series from cudf._lib.copying import pack, unpack from cudf.testing._utils import assert_eq @@ -52,7 +52,7 @@ def check_packed_equality(df): assert_packed_frame_equality(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_equality(sortvaldf) @@ -120,7 +120,7 @@ def check_packed_unique_pointers(df): assert_packed_frame_unique_pointers(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_unique_pointers(sortvaldf) @@ -188,7 +188,7 @@ def check_packed_pickled_equality(df): assert_packed_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -261,7 +261,7 @@ def check_packed_serialized_equality(df): assert_packed_frame_serializable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) 
assert_packed_frame_serializable(sortvaldf) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 71c1f206a64..69ccb5be860 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, RangeIndex, Series +from cudf import DataFrame, Index, RangeIndex, Series from cudf.core.buffer import as_buffer from cudf.testing._utils import assert_eq @@ -22,7 +22,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) + assert isinstance(sortvaldf.index, (Index, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -80,7 +80,7 @@ def test_memory_usage_dataframe(): def test_pickle_index(): nelem = 10 - idx = GenericIndex(np.arange(nelem), name="a") + idx = Index(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) assert (idx == out).all() diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index e7fa401f1ec..7a67fddd87b 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -302,39 +302,40 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): [ ( cudf.Index([1, 2, 3, None]), - "Int64Index([1, 2, 3, ], dtype='int64')", + "Index([1, 2, 3, ], dtype='int64')", ), ( cudf.Index([None, 2.2, 3.324342, None]), - "Float64Index([, 2.2, 3.324342, ], dtype='float64')", + "Index([, 2.2, 3.324342, ], dtype='float64')", ), ( cudf.Index([None, None, None], name="hello"), - "StringIndex([None None None], dtype='object', name='hello')", + "Index([, , ], dtype='object', name='hello')", ), ( cudf.Index([None, None, None], dtype="float", name="hello"), - "Float64Index([, , ], dtype='float64', name='hello')", + "Index([, , ], dtype='float64', name='hello')", ), ( 
cudf.Index([None], dtype="float64", name="hello"), - "Float64Index([], dtype='float64', name='hello')", + "Index([], dtype='float64', name='hello')", ), ( cudf.Index([None], dtype="int8", name="hello"), - "Int8Index([], dtype='int8', name='hello')", + "Index([], dtype='int8', name='hello')", ), ( cudf.Index([None] * 50, dtype="object"), - "StringIndex([None None None None None None None None " - "None None None None None None\n None None None None None None " - "None None None None None None None None\n None None None None " - "None None None None None None None None None None\n None None " - "None None None None None None], dtype='object')", + "Index([, , , , , , , , , " + ", , ,\n , , , , , , , " + ", , , , ,\n , , , , " + ", , , , , , , ,\n , " + ", , , , , , , , , , " + ",\n , ],\n dtype='object')", ), ( cudf.Index([None] * 20, dtype="uint32"), - "UInt32Index([, , , , , , , , " + "Index([, , , , , , , , " ",\n , , , , , , , , " ",\n , ],\n dtype='uint32')", ), @@ -342,7 +343,7 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): cudf.Index( [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" ), - "Int16Index([, 111, 22, 33, , 23, 34, 2343, ], " + "Index([, 111, 22, 33, , 23, 34, 2343, ], " "dtype='int16')", ), ( @@ -482,7 +483,7 @@ def test_dataframe_null_index_repr(df, pandas_special_case): actual_repr = repr(gdf) if pandas_special_case: - # Pandas inconsistently print StringIndex null values + # Pandas inconsistently print Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. 
@@ -561,7 +562,7 @@ def test_series_null_index_repr(sr, pandas_special_case): actual_repr = repr(gsr) if pandas_special_case: - # Pandas inconsistently print StringIndex null values + # Pandas inconsistently print Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index e7f26e259c6..2fdc3ef441b 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -193,8 +193,8 @@ def test_serialize_range_index(): def test_serialize_generic_index(): - index = cudf.core.index.GenericIndex(cudf.Series(np.arange(10))) - outindex = cudf.core.index.GenericIndex.deserialize(*index.serialize()) + index = cudf.core.index.Index(cudf.Series(np.arange(10))) + outindex = cudf.core.index.Index.deserialize(*index.serialize()) assert_eq(index, outindex) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 200bd30cb12..618f94ed25b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -17,7 +17,7 @@ from cudf import concat from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.string import StringColumn -from cudf.core.index import StringIndex, as_index +from cudf.core.index import Index, as_index from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1075,8 +1075,7 @@ def test_string_index(): pdf.index = stringIndex gdf.index = stringIndex assert_eq(pdf, gdf) - with pytest.warns(FutureWarning): - stringIndex = StringIndex(["a", "b", "c", "d", "e"], name="name") + stringIndex = Index(["a", "b", "c", "d", "e"], name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 821ec103204..e44775e56df 
100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -57,8 +57,8 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.StringIndex): - return cudf.StringIndex(["cat", "dog"], name=idx.name) + elif isinstance(idx._column, cudf.core.column.StringColumn): + return cudf.Index(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -69,8 +69,8 @@ def _nonempty_index(idx): categories=categories, codes=codes, ordered=ordered ) return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.GenericIndex): - return cudf.core.index.GenericIndex( + elif isinstance(idx, cudf.core.index.Index): + return cudf.core.index.Index( np.arange(2, dtype=idx.dtype), name=idx.name ) elif isinstance(idx, cudf.core.multiindex.MultiIndex):