From 3a95a191fea55cd3985f312f3154452be3e3d2b6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 14:09:05 -0700 Subject: [PATCH 01/20] Create a new 1d frame class and implement a name for them. --- python/cudf/cudf/core/frame.py | 17 +++++++++++++++++ python/cudf/cudf/core/index.py | 34 +++++++++++---------------------- python/cudf/cudf/core/series.py | 15 ++------------- 3 files changed, 30 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5c4186c4ac7..4fc2e3990fb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3295,6 +3295,23 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) +class FrameOneD(Frame): + """A one-dimensional frame. + + Frames with only a single dimension share certain logic that is encoded in + this class. + """ + + @property + def name(self): + """The name of this object.""" + return next(iter(self._data.names)) + + @name.setter + def name(self, value): + self._data[value] = self._data.pop(self.name) + + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0ffe0c11fef..786bf44bb74 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -29,7 +29,7 @@ ) from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import Frame +from cudf.core.frame import FrameOneD from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -73,7 +73,7 @@ def _to_frame(this_index, index=True, name=None): ) -class Index(Frame, Serializable): +class Index(FrameOneD, Serializable): dtype: DtypeObj @@ -388,18 +388,6 @@ def names(self, values): self.name = values[0] - @property - def 
name(self): - """ - Returns the name of the Index. - """ - return next(iter(self._data.names)) - - @name.setter - def name(self, value): - col = self._data.pop(self.name) - self._data[value] = col - def dropna(self, how="any"): """ Return an Index with null values removed. @@ -1557,7 +1545,7 @@ def _from_table(cls, table): @classmethod def _from_data(cls, data, index=None): - return cls._from_table(Frame(data=data)) + return cls._from_table(FrameOneD(data=data)) _accessors = set() # type: Set[Any] @@ -1606,7 +1594,7 @@ def __new__( if step == 0: raise ValueError("Step must not be zero.") - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) if isinstance(start, range): therange = start start = therange.start @@ -1978,7 +1966,7 @@ def __new__(cls, values, **kwargs): Column's name. Otherwise if this name is different from the value Column's, the values Column will be cloned to adopt this name. """ - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) out._initialize(values, **kwargs) return out @@ -2201,7 +2189,7 @@ class NumericIndex(GenericIndex): def __new__(cls, data=None, dtype=None, copy=False, name=None): - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) dtype = _index_to_dtype[cls] if copy: data = column.as_column(data, dtype=dtype).copy() @@ -2323,7 +2311,7 @@ def __new__( # pandas dtindex creation first which. 
For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) if freq is not None: raise NotImplementedError("Freq is not yet supported") @@ -2578,7 +2566,7 @@ def __new__( name=None, ) -> "TimedeltaIndex": - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) if freq is not None: raise NotImplementedError("freq is not yet supported") @@ -2710,7 +2698,7 @@ def __new__( ) if copy: data = column.as_column(data, dtype=dtype).copy(deep=True) - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, CategoricalColumn): data = data @@ -2936,7 +2924,7 @@ def __new__( ) -> "IntervalIndex": if copy: data = column.as_column(data, dtype=dtype).copy() - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, IntervalColumn): data = data @@ -3009,7 +2997,7 @@ class StringIndex(GenericIndex): """ def __new__(cls, values, copy=False, **kwargs): - out = Frame.__new__(cls) + out = FrameOneD.__new__(cls) kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): values = values.copy(deep=copy) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4cc5fb56a4c..afe4f8e25ed 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -38,7 +38,7 @@ from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, _drop_rows_by_labels +from cudf.core.frame import FrameOneD, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer @@ -61,7 +61,7 @@ ) -class Series(Frame, Serializable): +class Series(FrameOneD, Serializable): @property def 
_constructor(self): return Series @@ -441,17 +441,6 @@ def ndim(self): """ return 1 - @property - def name(self): - """Returns name of the Series. - """ - return self._data.names[0] - - @name.setter - def name(self, value): - col = self._data.pop(self.name) - self._data[value] = col - @classmethod def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] From a2718d179717dcdbd39943e8a50654fcf208fd4a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 14:11:25 -0700 Subject: [PATCH 02/20] Move ndim. --- python/cudf/cudf/core/frame.py | 5 +++++ python/cudf/cudf/core/index.py | 6 ------ python/cudf/cudf/core/series.py | 6 ------ 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4fc2e3990fb..807663559ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3311,6 +3311,11 @@ def name(self): def name(self, value): self._data[value] = self._data.pop(self.name) + @property + def ndim(self): + """Dimension of the data (always 1).""" + return 1 + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 786bf44bb74..af649a883b8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -362,12 +362,6 @@ def deserialize(cls, header, frames): index = col_typ.deserialize(h, frames[: header["frame_count"]]) return idx_typ(index, name=name) - @property - def ndim(self): - """Dimension of the data. Apart from MultiIndex ndim is always 1. 
- """ - return 1 - @property def names(self): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index afe4f8e25ed..c63baf1be6f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -435,12 +435,6 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) - @property - def ndim(self): - """Dimension of the data. Series ndim is always 1. - """ - return 1 - @classmethod def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] From b176e5480938d5e6dde06dfc97a875448f74851b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 14:17:38 -0700 Subject: [PATCH 03/20] Remove unnecessary constructor property. --- python/cudf/cudf/core/dataframe.py | 6 +----- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 8 ++------ python/cudf/cudf/tests/test_dataframe.py | 6 ------ 4 files changed, 4 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 131463e8871..179dc14ccb3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -554,10 +554,6 @@ def _align_input_series_indices(data, index): return data, index - @property - def _constructor(self): - return DataFrame - @property def _constructor_sliced(self): return Series @@ -1456,7 +1452,7 @@ def _get_columns_by_label(self, labels, downcast=False): new_data, index=self.index, name=labels ) return out - out = self._constructor()._from_data( + out = self.__class__()._from_data( new_data, index=self.index, columns=new_data.to_pandas_index() ) return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 807663559ca..0147d452811 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -608,7 +608,7 @@ def _get_columns_by_index(self, indices): """ data = self._data.select_by_index(indices) - return self._constructor( + return 
self.__class__( data, columns=data.to_pandas_index(), index=self.index ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c63baf1be6f..124a0f5f7ac 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -62,10 +62,6 @@ class Series(FrameOneD, Serializable): - @property - def _constructor(self): - return Series - @property def _constructor_sliced(self): raise NotImplementedError( @@ -470,9 +466,9 @@ def _get_columns_by_label(self, labels, downcast=False): new_data = super()._get_columns_by_label(labels, downcast) return ( - self._constructor(data=new_data, index=self.index) + self.__class__(data=new_data, index=self.index) if len(new_data) > 0 - else self._constructor(dtype=self.dtype, name=self.name) + else self.__class__(dtype=self.dtype, name=self.name) ) @classmethod diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 06b08d79093..5ae678d6839 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4377,12 +4377,6 @@ def test_constructor_properties(): df[key1] = val1 df[key2] = val2 - # Correct use of _constructor (for DataFrame) - assert_eq(df, df._constructor({key1: val1, key2: val2})) - - # Correct use of _constructor (for cudf.Series) - assert_eq(df[key1], df[key2]._constructor(val1, name=key1)) - # Correct use of _constructor_sliced (for DataFrame) assert_eq(df[key1], df._constructor_sliced(val1, name=key1)) From 90e112c5338c7f0cef69808fc85154cfb6fd1999 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 14:20:44 -0700 Subject: [PATCH 04/20] Move shape. 
--- python/cudf/cudf/core/frame.py | 6 ++++++ python/cudf/cudf/core/index.py | 6 ------ python/cudf/cudf/core/series.py | 6 ------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0147d452811..0957c1466c5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3316,6 +3316,12 @@ def ndim(self): """Dimension of the data (always 1).""" return 1 + @property + def shape(self): + """Returns a tuple representing the dimensionality of the Index. + """ + return (len(self),) + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index af649a883b8..57cf977a2c9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -180,12 +180,6 @@ def drop_duplicates(self, keep="first"): """ # noqa: E501 return super().drop_duplicates(keep=keep) - @property - def shape(self): - """Returns a tuple representing the dimensionality of the Index. - """ - return (len(self),) - def serialize(self): header = {} header["index_column"] = {} diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 124a0f5f7ac..f020be286ae 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -397,12 +397,6 @@ def serialize(self): return header, frames - @property - def shape(self): - """Returns a tuple representing the dimensionality of the Series. - """ - return (len(self),) - @property def dt(self): """ From b99444fc3656eb3a3d48e45e10f8aa1db0a70e2e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 15:11:01 -0700 Subject: [PATCH 05/20] Move __iter__. 
--- python/cudf/cudf/core/frame.py | 3 +++ python/cudf/cudf/core/index.py | 3 --- python/cudf/cudf/core/series.py | 7 ++----- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0957c1466c5..db782ad7371 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3322,6 +3322,9 @@ def shape(self): """ return (len(self),) + def __iter__(self): + cudf.utils.utils.raise_iteration_error(obj=self) + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 57cf977a2c9..d583d22cb60 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -271,9 +271,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - def __iter__(self): - cudf.utils.utils.raise_iteration_error(obj=self) - @classmethod def from_arrow(cls, array): """Convert PyArrow Array/ChunkedArray to Index diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f020be286ae..3e9a7d98963 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1159,12 +1159,9 @@ def __getitem__(self, arg): else: return self.loc[arg] - def __iter__(self): - cudf.utils.utils.raise_iteration_error(obj=self) + iteritems = FrameOneD.__iter__ - iteritems = __iter__ - - items = __iter__ + items = FrameOneD.__iter__ def to_dict(self, into=dict): raise TypeError( From 58f0a236d42ed3d10a7dccf79011dcaf2a741a55 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 15:46:22 -0700 Subject: [PATCH 06/20] Rename index._values to index._column. 
--- python/cudf/cudf/core/column/categorical.py | 12 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/index.py | 126 ++++++++++---------- python/cudf/cudf/core/indexing.py | 6 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 10 +- python/cudf/cudf/tests/test_index.py | 30 ++--- 9 files changed, 96 insertions(+), 100 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 3cd1a599ddc..1e11138fcf0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -918,7 +918,7 @@ def as_numerical(self) -> NumericalColumn: @property def categories(self) -> ColumnBase: - return self.dtype.categories._values + return self.dtype.categories._column @categories.setter def categories(self, value): @@ -1044,7 +1044,7 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: self._encode(other), size=len(self), dtype=self.codes.dtype ) col = column.build_categorical_column( - categories=self.dtype.categories._values, + categories=self.dtype.categories._column, codes=column.as_column(ary), mask=self.base_mask, ordered=self.dtype.ordered, @@ -1056,7 +1056,7 @@ def sort_by_values( ) -> Tuple[CategoricalColumn, NumericalColumn]: codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( - categories=self.dtype.categories._values, + categories=self.dtype.categories._column, codes=column.as_column(codes.base_data, dtype=codes.dtype), mask=codes.base_mask, size=codes.size, @@ -1317,7 +1317,7 @@ def fillna( result = super().fillna(value=fill_value, method=method) result = column.build_categorical_column( - categories=self.dtype.categories._values, + categories=self.dtype.categories._column, codes=column.as_column(result.base_data, dtype=result.dtype), offset=result.offset, 
size=result.size, @@ -1381,7 +1381,7 @@ def as_categorical_column( if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) @@ -1434,7 +1434,7 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) else: return column.build_categorical_column( - categories=self.dtype.categories._values, + categories=self.dtype.categories._column, codes=column.as_column( self.codes.base_data, dtype=self.codes.dtype ), diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0f039b137bc..509925d7a97 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1802,7 +1802,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif isinstance(arbitrary, cudf.Index): - data = arbitrary._values + data = arbitrary._column if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 179dc14ccb3..fc087d56b4d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -8001,7 +8001,7 @@ def _get_union_of_indices(indexes): else: merged_index = cudf.core.Index._concat(indexes) merged_index = merged_index.drop_duplicates() - _, inds = merged_index._values.sort_by_values() + _, inds = merged_index._column.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index db782ad7371..3dc381ac4ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -475,9 +475,9 @@ def _concat( ) if not isinstance( out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): + ) 
and is_categorical_dtype(out._index._column.dtype): out = out.set_index( - cudf.core.index.as_index(out.index._values) + cudf.core.index.as_index(out.index._column) ) # Reassign index and column names diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d583d22cb60..e91cb630da0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -69,7 +69,7 @@ def _to_frame(this_index, index=True, name=None): col_name = this_index.name return cudf.DataFrame( - {col_name: this_index._values}, index=this_index if index else None + {col_name: this_index._column}, index=this_index if index else None ) @@ -145,9 +145,9 @@ def __init__( """ pass - @cached_property - def _values(self) -> ColumnBase: - raise NotImplementedError + @property + def _column(self): + return self._data[self.name] def __getitem__(self, key): raise NotImplementedError() @@ -185,7 +185,7 @@ def serialize(self): header["index_column"] = {} # store metadata values of index separately # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._values.serialize() + header["index_column"], frames = self._column.serialize() header["name"] = pickle.dumps(self.name) header["dtype"] = pickle.dumps(self.dtype) @@ -194,7 +194,7 @@ def serialize(self): return header, frames def __contains__(self, item): - return item in self._values + return item in self._column @annotate("INDEX_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): @@ -341,7 +341,7 @@ def values_host(self): >>> type(index.values_host) """ - return self._values.values_host + return self._column.values_host @classmethod def deserialize(cls, header, frames): @@ -428,9 +428,9 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. 
""" - if self._values.has_nulls: + if self._column.has_nulls: return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name + self._column.astype("str").fillna(cudf._NA_REP), name=self.name ) else: return self @@ -611,7 +611,7 @@ def argsort(self, ascending=True, **kwargs): >>> index.argsort(ascending=False) array([3, 2, 1, 0, 4], dtype=int32) """ - indices = self._values.argsort(ascending=ascending, **kwargs) + indices = self._column.argsort(ascending=ascending, **kwargs) return cupy.asarray(indices) @property @@ -632,13 +632,13 @@ def values(self): >>> type(index.values) """ - return self._values.values + return self._column.values def any(self): """ Return whether any elements is True in Index. """ - return self._values.any() + return self._column.any() def to_pandas(self): """ @@ -657,7 +657,7 @@ def to_pandas(self): >>> type(idx) """ - return pd.Index(self._values.to_pandas(), name=self.name) + return pd.Index(self._column.to_pandas(), name=self.name) def tolist(self): @@ -680,7 +680,7 @@ def gpu_values(self): """ View the data as a numba device array object """ - return self._values.data_array_view + return self._column.data_array_view def min(self): """ @@ -705,7 +705,7 @@ def min(self): >>> idx.min() 1 """ - return self._values.min() + return self._column.min() def max(self): """ @@ -730,7 +730,7 @@ def max(self): >>> idx.max() 3 """ - return self._values.max() + return self._column.max() def sum(self): """ @@ -748,11 +748,11 @@ def sum(self): >>> idx.sum() 6 """ - return self._values.sum() + return self._column.sum() @classmethod def _concat(cls, objs): - data = ColumnBase._concat([o._values for o in objs]) + data = ColumnBase._concat([o._column for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: [name] = names @@ -820,7 +820,7 @@ def append(self, other): f"either one of them to same dtypes." 
) - if isinstance(self._values, cudf.core.column.NumericalColumn): + if isinstance(self._column, cudf.core.column.NumericalColumn): if self.dtype != other.dtype: this, other = numeric_normalize_types(self, other) to_concat = [this, other] @@ -974,7 +974,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: raise NotImplementedError("key parameter is not yet implemented.") - indices = self._values.argsort(ascending=ascending) + indices = self._column.argsort(ascending=ascending) index_sorted = as_index(self.take(indices), name=self.name) if return_indexer: @@ -990,7 +990,7 @@ def unique(self): ------- Index without duplicates """ - return as_index(self._values.unique(), name=self.name) + return as_index(self._column.unique(), name=self.name) def __add__(self, other): return self._apply_op("__add__", other) @@ -1219,7 +1219,7 @@ def astype(self, dtype, copy=False): return self.copy(deep=copy) return as_index( - self.copy(deep=copy)._values.astype(dtype), name=self.name + self.copy(deep=copy)._column.astype(dtype), name=self.name ) def to_array(self, fillna=None): @@ -1238,7 +1238,7 @@ def to_array(self, fillna=None): if ``fillna`` is ``None``, null values are skipped. Therefore, the output size could be smaller. """ - return self._values.to_array(fillna=fillna) + return self._column.to_array(fillna=fillna) def to_series(self, index=None, name=None): """ @@ -1260,7 +1260,7 @@ def to_series(self, index=None, name=None): """ return cudf.Series( - self._values, + self._column, index=self.copy(deep=False) if index is None else index, name=self.name if name is None else name, ) @@ -1285,7 +1285,7 @@ def is_monotonic_increasing(self): Return if the index is monotonic increasing (only equal or increasing) values. 
""" - return self._values.is_monotonic_increasing + return self._column.is_monotonic_increasing @property def is_monotonic_decreasing(self): @@ -1293,7 +1293,7 @@ def is_monotonic_decreasing(self): Return if the index is monotonic decreasing (only equal or decreasing) values. """ - return self._values.is_monotonic_decreasing + return self._column.is_monotonic_decreasing @property def empty(self): @@ -1452,7 +1452,7 @@ def memory_usage(self, deep=False): ------- bytes used """ - return self._values._memory_usage(deep=deep) + return self._column._memory_usage(deep=deep) @classmethod def from_pandas(cls, index, nan_as_null=None): @@ -1636,7 +1636,7 @@ def _num_rows(self): return len(self) @cached_property - def _values(self): + def _column(self): if len(self) > 0: return column.arange( self._start, self._stop, self._step, dtype=self.dtype @@ -1647,7 +1647,7 @@ def _values(self): @property def _data(self): return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._values} + {self.name: self._column} ) def __contains__(self, item): @@ -1729,7 +1729,7 @@ def __getitem__(self, index): index = np.min_scalar_type(index).type(index) index = column.as_column(index) - return as_index(self._values[index], name=self.name) + return as_index(self._column[index], name=self.name) def __eq__(self, other): return super(type(self), self).__eq__(other) @@ -1846,7 +1846,7 @@ def to_gpu_array(self, fillna=None): if ``fillna`` is ``None``, null values are skipped. Therefore, the output size could be smaller. 
""" - return self._values.to_gpu_array(fillna=fillna) + return self._column.to_gpu_array(fillna=fillna) def to_pandas(self): return pd.RangeIndex( @@ -1916,7 +1916,7 @@ def get_slice_bound(self, label, side, kind=None): @property def __cuda_array_interface__(self): - return self._values.__cuda_array_interface__ + return self._column.__cuda_array_interface__ def memory_usage(self, **kwargs): return 0 @@ -1936,7 +1936,7 @@ class GenericIndex(Index): Attributes ---------- - _values: A Column object + _column: A Column object name: A string """ @@ -1977,10 +1977,6 @@ def _initialize(self, values, **kwargs): name = kwargs.get("name") super(Index, self).__init__({name: values}) - @property - def _values(self): - return next(iter(self._data.columns)) - def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -2006,18 +2002,18 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name = self.name if name is None else name if isinstance(self, (StringIndex, CategoricalIndex)): - result = as_index(self._values.astype(dtype), name=name, copy=deep) + result = as_index(self._column.astype(dtype), name=name, copy=deep) else: result = as_index( - self._values.copy(deep=deep).astype(dtype), name=name + self._column.copy(deep=deep).astype(dtype), name=name ) return result def __sizeof__(self): - return self._values.__sizeof__() + return self._column.__sizeof__() def __len__(self): - return len(self._values) + return len(self._column) def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) @@ -2059,7 +2055,7 @@ def __repr__(self): output = preprocess.to_pandas().__repr__() output = output.replace("nan", cudf._NA_REP) - elif preprocess._values.nullable: + elif preprocess._column.nullable: output = self._clean_nulls_from_index().to_pandas().__repr__() if not isinstance(self, StringIndex): @@ -2100,7 +2096,7 @@ def __getitem__(self, index): raise NotImplementedError( "Getting a scalar from an IntervalIndex is not 
yet supported" ) - res = self._values[index] + res = self._column[index] if not isinstance(index, int): res = as_index(res) res.name = self.name @@ -2117,7 +2113,7 @@ def dtype(self): """ `dtype` of the underlying values in GenericIndex. """ - return self._values.dtype + return self._column.dtype def find_label_range(self, first, last): """Find range that starts with *first* and ends with *last*, @@ -2129,7 +2125,7 @@ def find_label_range(self, first, last): The starting index and the ending index. The *last* value occurs at ``end - 1`` position. """ - col = self._values + col = self._column begin, end = None, None if first is not None: begin = col.find_first_value(first, closest=True) @@ -2143,14 +2139,14 @@ def is_unique(self): """ Return if the index has unique values. """ - return self._values.is_unique + return self._column.is_unique def get_slice_bound(self, label, side, kind): - return self._values.get_slice_bound(label, side, kind) + return self._column.get_slice_bound(label, side, kind) @property def __cuda_array_interface__(self): - return self._values.__cuda_array_interface__ + return self._column.__cuda_array_interface__ class NumericIndex(GenericIndex): @@ -2482,11 +2478,11 @@ def dayofweek(self): return self._get_dt_field("weekday") def to_pandas(self): - nanos = self._values.astype("datetime64[ns]") + nanos = self._column.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) def _get_dt_field(self, field): - out_column = self._values.get_dt_field(field) + out_column = self._column.get_dt_field(field) # column.column_empty_like always returns a Column object # but we need a NumericalColumn for GenericIndex.. # how should this be handled? 
@@ -2576,9 +2572,9 @@ def __new__( def to_pandas(self): return pd.TimedeltaIndex( - self._values.to_pandas(), + self._column.to_pandas(), name=self.name, - unit=self._values.time_unit, + unit=self._column.time_unit, ) @property @@ -2586,21 +2582,21 @@ def days(self): """ Number of days for each element. """ - return as_index(arbitrary=self._values.days, name=self.name) + return as_index(arbitrary=self._column.days, name=self.name) @property def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index(arbitrary=self._values.seconds, name=self.name) + return as_index(arbitrary=self._column.seconds, name=self.name) @property def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return as_index(arbitrary=self._values.microseconds, name=self.name) + return as_index(arbitrary=self._column.microseconds, name=self.name) @property def nanoseconds(self): @@ -2608,7 +2604,7 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. """ - return as_index(arbitrary=self._values.nanoseconds, name=self.name) + return as_index(arbitrary=self._column.nanoseconds, name=self.name) @property def components(self): @@ -2616,7 +2612,7 @@ def components(self): Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. """ - return self._values.components() + return self._column.components() @property def inferred_freq(self): @@ -2732,14 +2728,14 @@ def codes(self): """ The category codes of this categorical. """ - return self._values.cat().codes + return self._column.cat().codes @property def categories(self): """ The categories of this categorical. 
""" - return self._values.cat().categories + return self._column.cat().categories def interval_range( @@ -2977,7 +2973,7 @@ class StringIndex(GenericIndex): Attributes ---------- - _values: A StringColumn object or NDArray of strings + _column: A StringColumn object or NDArray of strings name: A string """ @@ -2987,7 +2983,7 @@ def __new__(cls, values, copy=False, **kwargs): if isinstance(values, StringColumn): values = values.copy(deep=copy) elif isinstance(values, StringIndex): - values = values._values.copy(deep=copy) + values = values._column.copy(deep=copy) else: values = column.as_column(values, dtype="str") if not pd.api.types.is_string_dtype(values.dtype): @@ -3002,11 +2998,11 @@ def to_pandas(self): return pd.Index(self.to_array(), name=self.name, dtype="object") def take(self, indices): - return self._values[indices] + return self._column[indices] def __repr__(self): return ( - f"{self.__class__.__name__}({self._values.to_array()}," + f"{self.__class__.__name__}({self._column.to_array()}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" @@ -3019,14 +3015,14 @@ def __repr__(self): @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._values, parent=self) + return StringMethods(column=self._column, parent=self) def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. 
""" - if self._values.has_nulls: + if self._column.has_nulls: return self.fillna(cudf._NA_REP) else: return self diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 7970b9fa3dc..42d309d7054 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -30,7 +30,7 @@ def indices_from_labels(obj, labels): if is_categorical_dtype(obj.index): labels = labels.astype("category") - codes = labels.codes.astype(obj.index._values.codes.dtype) + codes = labels.codes.astype(obj.index._column.codes.dtype) labels = column.build_categorical_column( categories=labels.dtype.categories, codes=codes, @@ -154,7 +154,7 @@ def __setitem__(self, key, value): and not isinstance(self._sr.index, cudf.MultiIndex) and is_scalar(value) ): - _append_new_row_inplace(self._sr.index._values, key) + _append_new_row_inplace(self._sr.index._column, key) _append_new_row_inplace(self._sr._column, value) return else: @@ -177,7 +177,7 @@ def _loc_to_iloc(self, arg): found_index = arg return found_index try: - found_index = self._sr.index._values.find_first_value( + found_index = self._sr.index._column.find_first_value( arg, closest=False ) return found_index diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3e9a7d98963..bb04353832a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -217,7 +217,7 @@ def __init__( data = data.values elif isinstance(data, Index): name = data.name - data = data._values + data = data._column if dtype is not None: data = data.astype(dtype) elif isinstance(data, ColumnAccessor): @@ -4045,7 +4045,7 @@ def reverse(self): """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) col = self._column[rinds] - index = self.index._values[rinds] + index = self.index._column[rinds] return self._copy_construct(data=col, index=index) def one_hot_encoding(self, cats, dtype="float64"): diff --git a/python/cudf/cudf/tests/test_dataframe.py 
b/python/cudf/cudf/tests/test_dataframe.py index 5ae678d6839..aac541d5750 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1087,7 +1087,7 @@ def test_dataframe_append_to_empty(): def test_dataframe_setitem_index_len1(): gdf = cudf.DataFrame() gdf["a"] = [1] - gdf["b"] = gdf.index._values + gdf["b"] = gdf.index._column np.testing.assert_equal(gdf.b.to_array(), [0]) @@ -2181,12 +2181,12 @@ def query_GPU_memory(note=""): cudaDF = cudaDF[boolmask] assert ( - cudaDF.index._values.data_array_view.device_ctypes_pointer - == cudaDF["col0"].index._values.data_array_view.device_ctypes_pointer + cudaDF.index._column.data_array_view.device_ctypes_pointer + == cudaDF["col0"].index._column.data_array_view.device_ctypes_pointer ) assert ( - cudaDF.index._values.data_array_view.device_ctypes_pointer - == cudaDF["col1"].index._values.data_array_view.device_ctypes_pointer + cudaDF.index._column.data_array_view.device_ctypes_pointer + == cudaDF["col1"].index._column.data_array_view.device_ctypes_pointer ) assert memory_used == query_GPU_memory() diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 21a431dd540..961493c6a42 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -247,13 +247,13 @@ def test_index_rename_inplace(): # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + assert gds_renamed_deep._column.data_ptr != gds._column.data_ptr # inplace=True returns none - expected_ptr = gds._values.data_ptr + expected_ptr = gds._column.data_ptr gds.rename("new_name", inplace=True) - assert expected_ptr == gds._values.data_ptr + assert expected_ptr == gds._column.data_ptr def test_index_rename_preserves_arg(): @@ -282,7 +282,7 @@ def test_set_index_as_property(): # Check set_index(Series) cdf.index = cdf["b"] - 
assert_eq(cdf.index._values.to_array(), col2) + assert_eq(cdf.index._column.to_array(), col2) with pytest.raises(ValueError): cdf.index = [list(range(10))] @@ -403,14 +403,14 @@ def test_index_copy_deep(idx, deep): same_ref = not deep if isinstance(idx, cudf.CategoricalIndex): assert ( - idx._values.codes.base_data.ptr - == idx_copy._values.codes.base_data.ptr + idx._column.codes.base_data.ptr + == idx_copy._column.codes.base_data.ptr ) == same_ref if isinstance( - idx._values.categories, cudf.core.column.string.StringColumn + idx._column.categories, cudf.core.column.string.StringColumn ): - children = idx._values.categories._base_children - copy_children = idx_copy._values.categories._base_children + children = idx._column.categories._base_children + copy_children = idx_copy._column.categories._base_children assert all( [ ( @@ -422,15 +422,15 @@ def test_index_copy_deep(idx, deep): ] ) elif isinstance( - idx._values.categories, cudf.core.column.numerical.NumericalColumn + idx._column.categories, cudf.core.column.numerical.NumericalColumn ): assert ( - idx._values.categories.base_data.ptr - == idx_copy._values.categories.base_data.ptr + idx._column.categories.base_data.ptr + == idx_copy._column.categories.base_data.ptr ) == same_ref elif isinstance(idx, cudf.core.index.StringIndex): - children = idx._values._base_children - copy_children = idx_copy._values._base_children + children = idx._column._base_children + copy_children = idx_copy._column._base_children assert all( [ ( @@ -445,7 +445,7 @@ def test_index_copy_deep(idx, deep): ) else: assert ( - idx._values.base_data.ptr == idx_copy._values.base_data.ptr + idx._column.base_data.ptr == idx_copy._column.base_data.ptr ) == same_ref From fa81cd45bb0eb54b2d0ebf9a118fdc6df9721e1f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 15:47:27 -0700 Subject: [PATCH 07/20] Move _column to FrameOneD. 
--- python/cudf/cudf/core/frame.py | 8 ++++++++ python/cudf/cudf/core/index.py | 4 ---- python/cudf/cudf/core/series.py | 8 -------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3dc381ac4ca..7ad5071a85e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3325,6 +3325,14 @@ def shape(self): def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) + @property + def _column(self): + return self._data[self.name] + + @_column.setter + def _column(self, value): + self._data[self.name] = value + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e91cb630da0..384500f4d16 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -145,10 +145,6 @@ def __init__( """ pass - @property - def _column(self): - return self._data[self.name] - def __getitem__(self, key): raise NotImplementedError() diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bb04353832a..06fd6e3d6aa 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -285,14 +285,6 @@ def _from_data( out.name = name return out - @property - def _column(self): - return self._data[self.name] - - @_column.setter - def _column(self, value): - self._data[self.name] = value - def __contains__(self, item): return item in self._index From 8f330c766d5e059248261247211ddc37677e0f7b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 16:03:27 -0700 Subject: [PATCH 08/20] Move _accessors and values. 
--- python/cudf/cudf/core/dataframe.py | 4 +--- python/cudf/cudf/core/frame.py | 32 ++++++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 24 +--------------------- python/cudf/cudf/core/series.py | 27 +------------------------ 4 files changed, 35 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc087d56b4d..e16c0fac30d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Optional, Set, TypeVar +from typing import Any, Optional, TypeVar import cupy import numpy as np @@ -7765,8 +7765,6 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) - _accessors = set() # type: Set[Any] - def from_pandas(obj, nan_as_null=None): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7ad5071a85e..3f5f371ac2e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -48,6 +48,12 @@ class Frame(libcudf.table.Table): _data: "ColumnAccessor" + @classmethod + def __init_subclass__(cls): + # All subclasses contain a set _accessors that is used to hold custom + # accessors defined by user APIs (see cudf/api/extensions/accessor.py). + cls._accessors = set() + @classmethod def _from_table(cls, table: Frame): return cls(table._data, index=table._index) @@ -3333,6 +3339,32 @@ def _column(self): def _column(self, value): self._data[self.name] = value + @property + def values(self): + """ + Return a CuPy representation of the data. + + Returns + ------- + out : cupy.ndarray + A representation of the underlying data. 
+ + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([1, -10, 100, 20]) + >>> ser.values + array([ 1, -10, 100, 20]) + >>> type(ser.values) + + >>> index = cudf.Index([1, -10, 100, 20]) + >>> index.values + array([ 1, -10, 100, 20]) + >>> type(index.values) + + """ + return self._column.values + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 384500f4d16..3f73cd0e016 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Set, Type +from typing import Any, Dict, Type import cupy import numpy as np @@ -610,26 +610,6 @@ def argsort(self, ascending=True, **kwargs): indices = self._column.argsort(ascending=ascending, **kwargs) return cupy.asarray(indices) - @property - def values(self): - """ - Return an array representing the data in the Index. - - Returns - ------- - array : A cupy array of data in the Index. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values - array([ 1, -10, 100, 20]) - >>> type(index.values) - - """ - return self._column.values - def any(self): """ Return whether any elements is True in Index. 
@@ -1528,8 +1508,6 @@ def _from_table(cls, table): def _from_data(cls, data, index=None): return cls._from_table(FrameOneD(data=data)) - _accessors = set() # type: Set[Any] - @property def _constructor_expanddim(self): return cudf.MultiIndex diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 06fd6e3d6aa..3fbdd8ff412 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional, Set +from typing import Any, Optional from uuid import uuid4 import cupy @@ -329,29 +329,6 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - @property - def values(self): - """ - Return a CuPy representation of the Series. - - Only the values in the Series will be returned. - - Returns - ------- - out : cupy.ndarray - The values of the Series. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values - array([ 1, -10, 100, 20]) - >>> type(ser.values) - - """ - return self._column.values - @property def values_host(self): """ @@ -6489,8 +6466,6 @@ def explode(self, ignore_index=False): return super()._explode(self._column_names[0], ignore_index) - _accessors = set() # type: Set[Any] - truediv_int_dtype_corrections = { "int8": "float32", From 6f28392db8c72163cab3c3c1972521ba538f8886 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 16:06:10 -0700 Subject: [PATCH 09/20] Move values_host. 
--- python/cudf/cudf/core/frame.py | 28 +++++++++++++++++++++++++++- python/cudf/cudf/core/index.py | 23 ----------------------- python/cudf/cudf/core/series.py | 23 ----------------------- 3 files changed, 27 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3f5f371ac2e..5d00d4a97bf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3347,7 +3347,7 @@ def values(self): Returns ------- out : cupy.ndarray - A representation of the underlying data. + A device representation of the underlying data. Examples -------- @@ -3365,6 +3365,32 @@ def values(self): """ return self._column.values + @property + def values_host(self): + """ + Return a NumPy representation of the data. + + Returns + ------- + out : numpy.ndarray + A host representation of the underlying data. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([1, -10, 100, 20]) + >>> ser.values_host + array([ 1, -10, 100, 20]) + >>> type(ser.values_host) + + >>> index = cudf.Index([1, -10, 100, 20]) + >>> index.values_host + array([ 1, -10, 100, 20]) + >>> type(index.values_host) + + """ + return self._column.values_host + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3f73cd0e016..e854e2551d7 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -316,29 +316,6 @@ def to_arrow(self): return self._data.columns[0].to_arrow() - @property - def values_host(self): - """ - Return a numpy representation of the Index. - - Only the values in the Index will be returned. - - Returns - ------- - out : numpy.ndarray - The values of the Index. 
- - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values_host - array([ 1, -10, 100, 20]) - >>> type(index.values_host) - - """ - return self._column.values_host - @classmethod def deserialize(cls, header, frames): h = header["index_column"] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3fbdd8ff412..cbaf1508150 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -329,29 +329,6 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - @property - def values_host(self): - """ - Return a numpy representation of the Series. - - Only the values in the Series will be returned. - - Returns - ------- - out : numpy.ndarray - The values of the Series. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values_host - array([ 1, -10, 100, 20]) - >>> type(ser.values_host) - - """ - return self._column.values_host - def serialize(self): header = {} frames = [] From 04f93414ba375f5ac89bc81093ddb143d38d5a50 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 16:42:04 -0700 Subject: [PATCH 10/20] Remove redundant copy method overrides. --- python/cudf/cudf/core/dataframe.py | 14 -------------- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/series.py | 8 -------- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e16c0fac30d..72ccd3a78af 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3154,20 +3154,6 @@ def take(self, positions, keep_index=True): out.columns = self.columns return out - def __copy__(self): - return self.copy(deep=True) - - def __deepcopy__(self, memo=None): - """ - Parameters - ---------- - memo, default None - Standard signature. 
Unused - """ - if memo is None: - memo = {} - return self.copy(deep=True) - @annotate("INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value): """ Add a column to DataFrame at the index specified by loc. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e854e2551d7..279d916997e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -314,7 +314,7 @@ def to_arrow(self): ] """ - return self._data.columns[0].to_arrow() + return self._column.to_arrow() @classmethod def deserialize(cls, header, frames): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cbaf1508150..6d282c95db2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -586,14 +586,6 @@ def drop( if not inplace: return out - def __copy__(self, deep=True): - return self.copy(deep) - - def __deepcopy__(self, memo=None): - if memo is None: - memo = {} - return self.copy() - def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. If ``ignore_index=True``, the index is reset. From 8feb36bdf908a597f26b7c1f7f2ee526314f1c47 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 16:49:02 -0700 Subject: [PATCH 11/20] Move len. 
--- python/cudf/cudf/core/frame.py | 3 +++ python/cudf/cudf/core/index.py | 3 --- python/cudf/cudf/core/series.py | 5 ----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5d00d4a97bf..875d61c5153 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3331,6 +3331,9 @@ def shape(self): def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) + def __len__(self): + return len(self._column) + @property def _column(self): return self._data[self.name] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 279d916997e..bf9adad4e11 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1963,9 +1963,6 @@ def copy(self, name=None, deep=False, dtype=None, names=None): def __sizeof__(self): return self._column.__sizeof__() - def __len__(self): - return len(self._column) - def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) mr = 0 diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6d282c95db2..b1b96642421 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -958,11 +958,6 @@ def memory_usage(self, index=True, deep=False): n += self._index.memory_usage(deep=deep) return n - def __len__(self): - """Returns the size of the ``Series`` including null values. - """ - return len(self._column) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__": return get_appropriate_dispatched_func( From 2623c96bdb23f7ccc3c18c7920ec4218e5b22167 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 16:59:45 -0700 Subject: [PATCH 12/20] Move __bool__, to_gpu_array, tolist, and to_list. 
--- python/cudf/cudf/core/frame.py | 50 +++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 25 -------------- python/cudf/cudf/core/series.py | 50 --------------------------- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- 5 files changed, 52 insertions(+), 77 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 875d61c5153..5d1506d24df 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3334,6 +3334,12 @@ def __iter__(self): def __len__(self): return len(self._column) + def __bool__(self): + raise TypeError( + f"The truth value of a {type(self)} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + @property def _column(self): return self._data[self.name] @@ -3394,6 +3400,50 @@ def values_host(self): """ return self._column.values_host + def tolist(self): + + raise TypeError( + "cuDF does not support conversion to host memory " + "via the `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." + ) + + to_list = tolist + + def to_gpu_array(self, fillna=None): + """Get a dense numba device array for the data. + + Parameters + ---------- + fillna : str or None + See *fillna* in ``.to_array``. + + Notes + ----- + + if ``fillna`` is ``None``, null values are skipped. Therefore, the + output size could be smaller. 
+ + Returns + ------- + numba.DeviceNDArray + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([10, 20, 30, 40, 50]) + >>> s + 0 10 + 1 20 + 2 30 + 3 40 + 4 50 + dtype: int64 + >>> s.to_gpu_array() + + """ + return self._column.to_gpu_array(fillna=fillna) + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bf9adad4e11..ded7b826191 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -612,16 +612,6 @@ def to_pandas(self): """ return pd.Index(self._column.to_pandas(), name=self.name) - def tolist(self): - - raise TypeError( - "cuDF does not support conversion to host memory " - "via `tolist()` method. Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." - ) - - to_list = tolist - @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" @@ -1784,21 +1774,6 @@ def find_label_range(self, first=None, last=None): def to_frame(self, index=True, name=None): return _to_frame(self, index, name) - def to_gpu_array(self, fillna=None): - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : str or None - Replacement value to fill in place of nulls. - - Notes - ----- - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - return self._column.to_gpu_array(fillna=fillna) - def to_pandas(self): return pd.RangeIndex( start=self._start, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b1b96642421..a44d5dd6b2c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1154,22 +1154,6 @@ def take(self, indices, keep_index=True): data = self._column.take(col_inds, keep_index=False) return self._copy_construct(data=data, index=None) - def __bool__(self): - """Always raise TypeError when converting a Series - into a boolean. 
- """ - raise TypeError(f"can't compute boolean for {type(self)}") - - def tolist(self): - - raise TypeError( - "cuDF does not support conversion to host memory " - "via `tolist()` method. Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." - ) - - to_list = tolist - def head(self, n=5): """ Return the first `n` rows. @@ -3052,40 +3036,6 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return result_series._column.any() - def to_gpu_array(self, fillna=None): - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : str or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Returns - ------- - numba DeviceNDArray - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 20, 30, 40, 50]) - >>> s - 0 10 - 1 20 - 2 30 - 3 40 - 4 50 - dtype: int64 - >>> s.to_gpu_array() - - """ - return self._column.to_gpu_array(fillna=fillna) - def to_pandas(self, index=True, nullable=False, **kwargs): """ Convert to a Pandas Series. diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 961493c6a42..160c72ec461 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1799,7 +1799,7 @@ def test_index_tolist(data, dtype): TypeError, match=re.escape( r"cuDF does not support conversion to host memory " - r"via `tolist()` method. Consider using " + r"via the `tolist()` method. Consider using " r"`.to_arrow().to_pylist()` to construct a Python list." 
), ): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 0dc53fa29e9..0cc0ad57745 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -372,7 +372,7 @@ def test_series_tolist(data): TypeError, match=re.escape( r"cuDF does not support conversion to host memory " - r"via `tolist()` method. Consider using " + r"via the `tolist()` method. Consider using " r"`.to_arrow().to_pylist()` to construct a Python list." ), ): From a6c15f27c932834d5f6f0684effdd3ef7e3f2cea Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 09:23:55 -0700 Subject: [PATCH 13/20] Move from_arrow. --- python/cudf/cudf/core/frame.py | 31 +++++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 27 ------------------------- python/cudf/cudf/core/multiindex.py | 6 +++--- python/cudf/cudf/core/series.py | 31 ----------------------------- 4 files changed, 34 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5d1506d24df..1505155a506 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3444,6 +3444,37 @@ def to_gpu_array(self, fillna=None): """ return self._column.to_gpu_array(fillna=fillna) + @classmethod + def from_arrow(cls, array): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. 
+ + Returns + ------- + FrameOneD + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) + StringIndex(['a' 'b' None], dtype='object') + >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) + 0 a + 1 b + 2 + dtype: object + """ + return cls(cudf.core.column.column.ColumnBase.from_arrow(array)) + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ded7b826191..94a28338adc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -267,33 +267,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - @classmethod - def from_arrow(cls, array): - """Convert PyArrow Array/ChunkedArray to Index - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted to Index - - Raises - ------ - TypeError for invalid input type. 
- - Returns - ------- - cudf Index - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - StringIndex(['a' 'b' None], dtype='object') - """ - - return cls(cudf.core.column.column.ColumnBase.from_arrow(array)) - def to_arrow(self): """Convert Index to PyArrow Array diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a4748632aab..ee38e737e82 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -19,7 +19,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import column from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, FrameOneD from cudf.core.index import Index, as_index @@ -572,7 +572,7 @@ def from_arrow(cls, table): names=['a', 'b']) """ - return super(Index, cls).from_arrow(table) + return super(FrameOneD, cls).from_arrow(table) def to_arrow(self): """Convert MultiIndex to PyArrow Table @@ -606,7 +606,7 @@ def to_arrow(self): ] """ - return super(Index, self).to_arrow() + return super(FrameOneD, self).to_arrow() @property def codes(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a44d5dd6b2c..41eaf15f271 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -411,37 +411,6 @@ def _get_columns_by_label(self, labels, downcast=False): else self.__class__(dtype=self.dtype, name=self.name) ) - @classmethod - def from_arrow(cls, array): - """ - Convert from PyArrow Array/ChunkedArray to Series. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted to cudf Series. - - Raises - ------ - TypeError for invalid input type. 
- - Returns - ------- - cudf Series - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object - """ - - return cls(cudf.core.column.ColumnBase.from_arrow(array)) - def to_arrow(self): """ Convert Series to a PyArrow Array. From 808d5359531e4564161e8a7e97d2cef59ad9f98a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 09:27:13 -0700 Subject: [PATCH 14/20] Move to_arrow. --- python/cudf/cudf/core/frame.py | 30 ++++++++++++++++++++++++++++++ python/cudf/cudf/core/index.py | 22 ---------------------- python/cudf/cudf/core/series.py | 22 ---------------------- 3 files changed, 30 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1505155a506..20b9edfb843 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3475,6 +3475,36 @@ def from_arrow(cls, array): """ return cls(cudf.core.column.column.ColumnBase.from_arrow(array)) + def to_arrow(self): + """ + Convert to a PyArrow Array. 
+ + Returns + ------- + PyArrow Array + + Examples + -------- + >>> import cudf + >>> sr = cudf.Series(["a", "b", None]) + >>> sr.to_arrow() + + [ + "a", + "b", + null + ] + >>> ind = cudf.Index(["a", "b", None]) + >>> ind.to_arrow() + + [ + "a", + "b", + null + ] + """ + return self._column.to_arrow() + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 94a28338adc..25b00042efd 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -267,28 +267,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - def to_arrow(self): - """Convert Index to PyArrow Array - - Returns - ------- - PyArrow Array - - Examples - -------- - >>> import cudf - >>> ind = cudf.Index(["a", "b", None]) - >>> ind.to_arrow() - - [ - "a", - "b", - null - ] - """ - - return self._column.to_arrow() - @classmethod def deserialize(cls, header, frames): h = header["index_column"] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 41eaf15f271..64606695c19 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -411,28 +411,6 @@ def _get_columns_by_label(self, labels, downcast=False): else self.__class__(dtype=self.dtype, name=self.name) ) - def to_arrow(self): - """ - Convert Series to a PyArrow Array. - - Returns - ------- - PyArrow Array - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(["a", "b", None]) - >>> sr.to_arrow() - - [ - "a", - "b", - null - ] - """ - return self._column.to_arrow() - def drop( self, labels=None, From 7e899cdb664b5d50311feb3c5af30ff0d68db579 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 10:11:11 -0700 Subject: [PATCH 15/20] Clean up to_frame. 
--- python/cudf/cudf/core/index.py | 65 +++++++++++++---------------- python/cudf/cudf/core/multiindex.py | 26 ------------ 2 files changed, 28 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 25b00042efd..c70b3d04a31 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -45,34 +45,6 @@ from cudf.utils.utils import cached_property, search_range -def _to_frame(this_index, index=True, name=None): - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column - - Returns - ------- - DataFrame - cudf DataFrame - """ - - if name is not None: - col_name = name - elif this_index.name is None: - col_name = 0 - else: - col_name = this_index.name - - return cudf.DataFrame( - {col_name: this_index._column}, index=this_index if index else None - ) - - class Index(FrameOneD, Serializable): dtype: DtypeObj @@ -544,6 +516,33 @@ def any(self): """ return self._column.any() + def to_frame(self, index=True, name=None): + """Create a DataFrame with a column containing this Index + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index + name : str, default None + Name to be used for the column + + Returns + ------- + DataFrame + cudf DataFrame + """ + + if name is not None: + col_name = name + elif self.name is None: + col_name = 0 + else: + col_name = self.name + + return cudf.DataFrame( + {col_name: self._column}, index=self if index else None + ) + def to_pandas(self): """ Convert to a Pandas Index. 
@@ -1680,7 +1679,7 @@ def is_contiguous(self): @property def size(self): - return self.__len__() + return len(self) def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that @@ -1721,10 +1720,6 @@ def find_label_range(self, first=None, last=None): return begin, end - @copy_docstring(_to_frame) # type: ignore - def to_frame(self, index=True, name=None): - return _to_frame(self, index, name) - def to_pandas(self): return pd.RangeIndex( start=self._start, @@ -1978,10 +1973,6 @@ def __getitem__(self, index): else: return res - @copy_docstring(_to_frame) # type: ignore - def to_frame(self, index=True, name=None): - return _to_frame(self, index, name) - @property def dtype(self): """ diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ee38e737e82..896de51f046 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1048,9 +1048,6 @@ def deserialize(cls, header, frames): names = pickle.loads(header["names"]) return MultiIndex(names=names, source_data=source_data) - def __iter__(self): - cudf.utils.utils.raise_iteration_error(obj=self) - def __getitem__(self, index): # TODO: This should be a take of the _source_data only match = self.take(index) @@ -1107,29 +1104,6 @@ def get_level_values(self, level): ) return level_values - def _to_frame(self): - - # for each column of codes - # replace column with mapping from integers to levels - df = self.codes.copy(deep=False) - for idx, col in enumerate(df.columns): - # use merge as a replace fn - level = cudf.DataFrame( - { - "idx": column.arange( - len(self.levels[idx]), dtype=df[col].dtype - ), - "level": self.levels[idx], - } - ) - code = cudf.DataFrame({"idx": df[col]}) - df[col] = code.merge(level).level - return df - - @property - def _values(self): - return list([i for i in self]) - @classmethod def _concat(cls, objs): From 85efcdbb2b4c89727731e657987ecafe46461dfe Mon Sep 17 00:00:00 
2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 12:14:10 -0700 Subject: [PATCH 16/20] Minor improvement. --- python/cudf/cudf/core/series.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 64606695c19..9ef03c7f45f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -261,8 +261,7 @@ def __init__( @classmethod def _from_table(cls, table, index=None): - name = next(iter(table._data.keys())) - data = next(iter(table._data.values())) + name, data = next(iter(table._data.items())) if index is None: if table._index is not None: index = Index._from_table(table._index) From 1bd0e2bc57016e31d3b0112f0b3bdc595a408f1b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 29 Apr 2021 15:48:47 -0700 Subject: [PATCH 17/20] Revert "Rename index._values to index._column." This reverts commit 58f0a236d42ed3d10a7dccf79011dcaf2a741a55. # Conflicts: # python/cudf/cudf/core/index.py --- python/cudf/cudf/core/column/categorical.py | 12 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/index.py | 120 ++++++++++---------- python/cudf/cudf/core/indexing.py | 6 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 10 +- python/cudf/cudf/tests/test_index.py | 30 ++--- 9 files changed, 96 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1e11138fcf0..3cd1a599ddc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -918,7 +918,7 @@ def as_numerical(self) -> NumericalColumn: @property def categories(self) -> ColumnBase: - return self.dtype.categories._column + return self.dtype.categories._values @categories.setter def categories(self, value): @@ -1044,7 +1044,7 @@ def 
normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: self._encode(other), size=len(self), dtype=self.codes.dtype ) col = column.build_categorical_column( - categories=self.dtype.categories._column, + categories=self.dtype.categories._values, codes=column.as_column(ary), mask=self.base_mask, ordered=self.dtype.ordered, @@ -1056,7 +1056,7 @@ def sort_by_values( ) -> Tuple[CategoricalColumn, NumericalColumn]: codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( - categories=self.dtype.categories._column, + categories=self.dtype.categories._values, codes=column.as_column(codes.base_data, dtype=codes.dtype), mask=codes.base_mask, size=codes.size, @@ -1317,7 +1317,7 @@ def fillna( result = super().fillna(value=fill_value, method=method) result = column.build_categorical_column( - categories=self.dtype.categories._column, + categories=self.dtype.categories._values, codes=column.as_column(result.base_data, dtype=result.dtype), offset=result.offset, size=result.size, @@ -1381,7 +1381,7 @@ def as_categorical_column( if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._column)): + if not isinstance(self.categories, type(dtype.categories._values)): # If both categories are of different Column types, # return a column full of Nulls. 
return _create_empty_categorical_column(self, dtype) @@ -1434,7 +1434,7 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) else: return column.build_categorical_column( - categories=self.dtype.categories._column, + categories=self.dtype.categories._values, codes=column.as_column( self.codes.base_data, dtype=self.codes.dtype ), diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 509925d7a97..0f039b137bc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1802,7 +1802,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif isinstance(arbitrary, cudf.Index): - data = arbitrary._column + data = arbitrary._values if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 72ccd3a78af..e5d582fce42 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7985,7 +7985,7 @@ def _get_union_of_indices(indexes): else: merged_index = cudf.core.Index._concat(indexes) merged_index = merged_index.drop_duplicates() - _, inds = merged_index._column.sort_by_values() + _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 20b9edfb843..c51d1a80890 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -481,9 +481,9 @@ def _concat( ) if not isinstance( out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._column.dtype): + ) and is_categorical_dtype(out._index._values.dtype): out = out.set_index( - cudf.core.index.as_index(out.index._column) + cudf.core.index.as_index(out.index._values) ) # Reassign index and column names diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c70b3d04a31..5681f824faf 100644 --- a/python/cudf/cudf/core/index.py +++ 
b/python/cudf/cudf/core/index.py @@ -117,6 +117,10 @@ def __init__( """ pass + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + def __getitem__(self, key): raise NotImplementedError() @@ -153,7 +157,7 @@ def serialize(self): header["index_column"] = {} # store metadata values of index separately # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._column.serialize() + header["index_column"], frames = self._values.serialize() header["name"] = pickle.dumps(self.name) header["dtype"] = pickle.dumps(self.dtype) @@ -162,7 +166,7 @@ def serialize(self): return header, frames def __contains__(self, item): - return item in self._column + return item in self._values @annotate("INDEX_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): @@ -324,9 +328,9 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. """ - if self._column.has_nulls: + if self._values.has_nulls: return cudf.Index( - self._column.astype("str").fillna(cudf._NA_REP), name=self.name + self._values.astype("str").fillna(cudf._NA_REP), name=self.name ) else: return self @@ -507,15 +511,9 @@ def argsort(self, ascending=True, **kwargs): >>> index.argsort(ascending=False) array([3, 2, 1, 0, 4], dtype=int32) """ - indices = self._column.argsort(ascending=ascending, **kwargs) + indices = self._values.argsort(ascending=ascending, **kwargs) return cupy.asarray(indices) - def any(self): - """ - Return whether any elements is True in Index. 
- """ - return self._column.any() - def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index @@ -540,7 +538,7 @@ def to_frame(self, index=True, name=None): col_name = self.name return cudf.DataFrame( - {col_name: self._column}, index=self if index else None + {col_name: self._values}, index=self if index else None ) def to_pandas(self): @@ -560,7 +558,7 @@ def to_pandas(self): >>> type(idx) """ - return pd.Index(self._column.to_pandas(), name=self.name) + return pd.Index(self._values.to_pandas(), name=self.name) @ioutils.doc_to_dlpack() def to_dlpack(self): @@ -573,7 +571,7 @@ def gpu_values(self): """ View the data as a numba device array object """ - return self._column.data_array_view + return self._values.data_array_view def min(self): """ @@ -598,7 +596,7 @@ def min(self): >>> idx.min() 1 """ - return self._column.min() + return self._values.min() def max(self): """ @@ -623,7 +621,7 @@ def max(self): >>> idx.max() 3 """ - return self._column.max() + return self._values.max() def sum(self): """ @@ -641,11 +639,11 @@ def sum(self): >>> idx.sum() 6 """ - return self._column.sum() + return self._values.sum() @classmethod def _concat(cls, objs): - data = ColumnBase._concat([o._column for o in objs]) + data = ColumnBase._concat([o._values for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: [name] = names @@ -713,7 +711,7 @@ def append(self, other): f"either one of them to same dtypes." 
) - if isinstance(self._column, cudf.core.column.NumericalColumn): + if isinstance(self._values, cudf.core.column.NumericalColumn): if self.dtype != other.dtype: this, other = numeric_normalize_types(self, other) to_concat = [this, other] @@ -867,7 +865,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: raise NotImplementedError("key parameter is not yet implemented.") - indices = self._column.argsort(ascending=ascending) + indices = self._values.argsort(ascending=ascending) index_sorted = as_index(self.take(indices), name=self.name) if return_indexer: @@ -883,7 +881,7 @@ def unique(self): ------- Index without duplicates """ - return as_index(self._column.unique(), name=self.name) + return as_index(self._values.unique(), name=self.name) def __add__(self, other): return self._apply_op("__add__", other) @@ -1112,7 +1110,7 @@ def astype(self, dtype, copy=False): return self.copy(deep=copy) return as_index( - self.copy(deep=copy)._column.astype(dtype), name=self.name + self.copy(deep=copy)._values.astype(dtype), name=self.name ) def to_array(self, fillna=None): @@ -1131,7 +1129,7 @@ def to_array(self, fillna=None): if ``fillna`` is ``None``, null values are skipped. Therefore, the output size could be smaller. """ - return self._column.to_array(fillna=fillna) + return self._values.to_array(fillna=fillna) def to_series(self, index=None, name=None): """ @@ -1153,7 +1151,7 @@ def to_series(self, index=None, name=None): """ return cudf.Series( - self._column, + self._values, index=self.copy(deep=False) if index is None else index, name=self.name if name is None else name, ) @@ -1178,7 +1176,7 @@ def is_monotonic_increasing(self): Return if the index is monotonic increasing (only equal or increasing) values. 
""" - return self._column.is_monotonic_increasing + return self._values.is_monotonic_increasing @property def is_monotonic_decreasing(self): @@ -1186,7 +1184,7 @@ def is_monotonic_decreasing(self): Return if the index is monotonic decreasing (only equal or decreasing) values. """ - return self._column.is_monotonic_decreasing + return self._values.is_monotonic_decreasing @property def empty(self): @@ -1345,7 +1343,7 @@ def memory_usage(self, deep=False): ------- bytes used """ - return self._column._memory_usage(deep=deep) + return self._values._memory_usage(deep=deep) @classmethod def from_pandas(cls, index, nan_as_null=None): @@ -1527,7 +1525,7 @@ def _num_rows(self): return len(self) @cached_property - def _column(self): + def _values(self): if len(self) > 0: return column.arange( self._start, self._stop, self._step, dtype=self.dtype @@ -1538,7 +1536,7 @@ def _column(self): @property def _data(self): return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._column} + {self.name: self._values} ) def __contains__(self, item): @@ -1620,7 +1618,7 @@ def __getitem__(self, index): index = np.min_scalar_type(index).type(index) index = column.as_column(index) - return as_index(self._column[index], name=self.name) + return as_index(self._values[index], name=self.name) def __eq__(self, other): return super(type(self), self).__eq__(other) @@ -1788,7 +1786,7 @@ def get_slice_bound(self, label, side, kind=None): @property def __cuda_array_interface__(self): - return self._column.__cuda_array_interface__ + return self._values.__cuda_array_interface__ def memory_usage(self, **kwargs): return 0 @@ -1808,7 +1806,7 @@ class GenericIndex(Index): Attributes ---------- - _column: A Column object + _values: A Column object name: A string """ @@ -1849,6 +1847,10 @@ def _initialize(self, values, **kwargs): name = kwargs.get("name") super(Index, self).__init__({name: values}) + @property + def _values(self): + return next(iter(self._data.columns)) + def copy(self, name=None, 
deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -1874,15 +1876,15 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name = self.name if name is None else name if isinstance(self, (StringIndex, CategoricalIndex)): - result = as_index(self._column.astype(dtype), name=name, copy=deep) + result = as_index(self._values.astype(dtype), name=name, copy=deep) else: result = as_index( - self._column.copy(deep=deep).astype(dtype), name=name + self._values.copy(deep=deep).astype(dtype), name=name ) return result def __sizeof__(self): - return self._column.__sizeof__() + return self._values.__sizeof__() def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) @@ -1924,7 +1926,7 @@ def __repr__(self): output = preprocess.to_pandas().__repr__() output = output.replace("nan", cudf._NA_REP) - elif preprocess._column.nullable: + elif preprocess._values.nullable: output = self._clean_nulls_from_index().to_pandas().__repr__() if not isinstance(self, StringIndex): @@ -1965,7 +1967,7 @@ def __getitem__(self, index): raise NotImplementedError( "Getting a scalar from an IntervalIndex is not yet supported" ) - res = self._column[index] + res = self._values[index] if not isinstance(index, int): res = as_index(res) res.name = self.name @@ -1978,7 +1980,7 @@ def dtype(self): """ `dtype` of the underlying values in GenericIndex. """ - return self._column.dtype + return self._values.dtype def find_label_range(self, first, last): """Find range that starts with *first* and ends with *last*, @@ -1990,7 +1992,7 @@ def find_label_range(self, first, last): The starting index and the ending index. The *last* value occurs at ``end - 1`` position. """ - col = self._column + col = self._values begin, end = None, None if first is not None: begin = col.find_first_value(first, closest=True) @@ -2004,14 +2006,14 @@ def is_unique(self): """ Return if the index has unique values. 
""" - return self._column.is_unique + return self._values.is_unique def get_slice_bound(self, label, side, kind): - return self._column.get_slice_bound(label, side, kind) + return self._values.get_slice_bound(label, side, kind) @property def __cuda_array_interface__(self): - return self._column.__cuda_array_interface__ + return self._values.__cuda_array_interface__ class NumericIndex(GenericIndex): @@ -2343,11 +2345,11 @@ def dayofweek(self): return self._get_dt_field("weekday") def to_pandas(self): - nanos = self._column.astype("datetime64[ns]") + nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) def _get_dt_field(self, field): - out_column = self._column.get_dt_field(field) + out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object # but we need a NumericalColumn for GenericIndex.. # how should this be handled? @@ -2437,9 +2439,9 @@ def __new__( def to_pandas(self): return pd.TimedeltaIndex( - self._column.to_pandas(), + self._values.to_pandas(), name=self.name, - unit=self._column.time_unit, + unit=self._values.time_unit, ) @property @@ -2447,21 +2449,21 @@ def days(self): """ Number of days for each element. """ - return as_index(arbitrary=self._column.days, name=self.name) + return as_index(arbitrary=self._values.days, name=self.name) @property def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index(arbitrary=self._column.seconds, name=self.name) + return as_index(arbitrary=self._values.seconds, name=self.name) @property def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. 
""" - return as_index(arbitrary=self._column.microseconds, name=self.name) + return as_index(arbitrary=self._values.microseconds, name=self.name) @property def nanoseconds(self): @@ -2469,7 +2471,7 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. """ - return as_index(arbitrary=self._column.nanoseconds, name=self.name) + return as_index(arbitrary=self._values.nanoseconds, name=self.name) @property def components(self): @@ -2477,7 +2479,7 @@ def components(self): Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. """ - return self._column.components() + return self._values.components() @property def inferred_freq(self): @@ -2593,14 +2595,14 @@ def codes(self): """ The category codes of this categorical. """ - return self._column.cat().codes + return self._values.cat().codes @property def categories(self): """ The categories of this categorical. """ - return self._column.cat().categories + return self._values.cat().categories def interval_range( @@ -2838,7 +2840,7 @@ class StringIndex(GenericIndex): Attributes ---------- - _column: A StringColumn object or NDArray of strings + _values: A StringColumn object or NDArray of strings name: A string """ @@ -2848,7 +2850,7 @@ def __new__(cls, values, copy=False, **kwargs): if isinstance(values, StringColumn): values = values.copy(deep=copy) elif isinstance(values, StringIndex): - values = values._column.copy(deep=copy) + values = values._values.copy(deep=copy) else: values = column.as_column(values, dtype="str") if not pd.api.types.is_string_dtype(values.dtype): @@ -2863,11 +2865,11 @@ def to_pandas(self): return pd.Index(self.to_array(), name=self.name, dtype="object") def take(self, indices): - return self._column[indices] + return self._values[indices] def __repr__(self): return ( - f"{self.__class__.__name__}({self._column.to_array()}," + 
f"{self.__class__.__name__}({self._values.to_array()}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" @@ -2880,14 +2882,14 @@ def __repr__(self): @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._column, parent=self) + return StringMethods(column=self._values, parent=self) def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. """ - if self._column.has_nulls: + if self._values.has_nulls: return self.fillna(cudf._NA_REP) else: return self diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 42d309d7054..7970b9fa3dc 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -30,7 +30,7 @@ def indices_from_labels(obj, labels): if is_categorical_dtype(obj.index): labels = labels.astype("category") - codes = labels.codes.astype(obj.index._column.codes.dtype) + codes = labels.codes.astype(obj.index._values.codes.dtype) labels = column.build_categorical_column( categories=labels.dtype.categories, codes=codes, @@ -154,7 +154,7 @@ def __setitem__(self, key, value): and not isinstance(self._sr.index, cudf.MultiIndex) and is_scalar(value) ): - _append_new_row_inplace(self._sr.index._column, key) + _append_new_row_inplace(self._sr.index._values, key) _append_new_row_inplace(self._sr._column, value) return else: @@ -177,7 +177,7 @@ def _loc_to_iloc(self, arg): found_index = arg return found_index try: - found_index = self._sr.index._column.find_first_value( + found_index = self._sr.index._values.find_first_value( arg, closest=False ) return found_index diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9ef03c7f45f..0a7fe8b69d9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -217,7 +217,7 @@ def __init__( data = data.values elif isinstance(data, Index): name 
= data.name - data = data._column + data = data._values if dtype is not None: data = data.astype(dtype) elif isinstance(data, ColumnAccessor): @@ -3874,7 +3874,7 @@ def reverse(self): """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) col = self._column[rinds] - index = self.index._column[rinds] + index = self.index._values[rinds] return self._copy_construct(data=col, index=index) def one_hot_encoding(self, cats, dtype="float64"): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index aac541d5750..5ae678d6839 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1087,7 +1087,7 @@ def test_dataframe_append_to_empty(): def test_dataframe_setitem_index_len1(): gdf = cudf.DataFrame() gdf["a"] = [1] - gdf["b"] = gdf.index._column + gdf["b"] = gdf.index._values np.testing.assert_equal(gdf.b.to_array(), [0]) @@ -2181,12 +2181,12 @@ def query_GPU_memory(note=""): cudaDF = cudaDF[boolmask] assert ( - cudaDF.index._column.data_array_view.device_ctypes_pointer - == cudaDF["col0"].index._column.data_array_view.device_ctypes_pointer + cudaDF.index._values.data_array_view.device_ctypes_pointer + == cudaDF["col0"].index._values.data_array_view.device_ctypes_pointer ) assert ( - cudaDF.index._column.data_array_view.device_ctypes_pointer - == cudaDF["col1"].index._column.data_array_view.device_ctypes_pointer + cudaDF.index._values.data_array_view.device_ctypes_pointer + == cudaDF["col1"].index._values.data_array_view.device_ctypes_pointer ) assert memory_used == query_GPU_memory() diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 160c72ec461..158dffc3884 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -247,13 +247,13 @@ def test_index_rename_inplace(): # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert 
gds_renamed_deep._column.data_ptr != gds._column.data_ptr + assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr # inplace=True returns none - expected_ptr = gds._column.data_ptr + expected_ptr = gds._values.data_ptr gds.rename("new_name", inplace=True) - assert expected_ptr == gds._column.data_ptr + assert expected_ptr == gds._values.data_ptr def test_index_rename_preserves_arg(): @@ -282,7 +282,7 @@ def test_set_index_as_property(): # Check set_index(Series) cdf.index = cdf["b"] - assert_eq(cdf.index._column.to_array(), col2) + assert_eq(cdf.index._values.to_array(), col2) with pytest.raises(ValueError): cdf.index = [list(range(10))] @@ -403,14 +403,14 @@ def test_index_copy_deep(idx, deep): same_ref = not deep if isinstance(idx, cudf.CategoricalIndex): assert ( - idx._column.codes.base_data.ptr - == idx_copy._column.codes.base_data.ptr + idx._values.codes.base_data.ptr + == idx_copy._values.codes.base_data.ptr ) == same_ref if isinstance( - idx._column.categories, cudf.core.column.string.StringColumn + idx._values.categories, cudf.core.column.string.StringColumn ): - children = idx._column.categories._base_children - copy_children = idx_copy._column.categories._base_children + children = idx._values.categories._base_children + copy_children = idx_copy._values.categories._base_children assert all( [ ( @@ -422,15 +422,15 @@ def test_index_copy_deep(idx, deep): ] ) elif isinstance( - idx._column.categories, cudf.core.column.numerical.NumericalColumn + idx._values.categories, cudf.core.column.numerical.NumericalColumn ): assert ( - idx._column.categories.base_data.ptr - == idx_copy._column.categories.base_data.ptr + idx._values.categories.base_data.ptr + == idx_copy._values.categories.base_data.ptr ) == same_ref elif isinstance(idx, cudf.core.index.StringIndex): - children = idx._column._base_children - copy_children = idx_copy._column._base_children + children = idx._values._base_children + copy_children = idx_copy._values._base_children assert all( [ ( 
@@ -445,7 +445,7 @@ def test_index_copy_deep(idx, deep): ) else: assert ( - idx._column.base_data.ptr == idx_copy._column.base_data.ptr + idx._values.base_data.ptr == idx_copy._values.base_data.ptr ) == same_ref From d189984336df2c0ee8a41cd40e5bde1ff5f9d940 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 13:02:21 -0700 Subject: [PATCH 18/20] Fix previously incorrect usage of indexes as booleans. --- python/cudf/cudf/core/index.py | 6 ++++++ python/cudf/cudf/core/indexing.py | 5 ++++- python/cudf/cudf/tests/test_pickling.py | 4 +++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5681f824faf..d82b4f53e20 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -541,6 +541,12 @@ def to_frame(self, index=True, name=None): {col_name: self._values}, index=self if index else None ) + def any(self): + """ + Return whether any element in the Index is True. + """ + return self._values.any() + def to_pandas(self): """ Convert to a Pandas Index. diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 7970b9fa3dc..a732abc0705 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -2,6 +2,7 @@ from typing import Any, Union +import cupy as cp import numpy as np import pandas as pd from nvtx import annotate @@ -58,7 +59,9 @@ def get_label_range_or_mask(index, start, stop, step): if start is not None and stop is not None: if start > stop: return slice(0, 0, None) - boolean_mask = (index >= start) and (index <= stop) + # TODO: Once Index binary ops are updated to support logical_and, + # can use that instead of using cupy.
+ boolean_mask = cp.logical_and((index >= start), (index <= stop)) elif start is not None: boolean_mask = index >= start else: diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index e87ab3730dd..ca819c7f59b 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -90,7 +90,9 @@ def test_pickle_index(): idx = GenericIndex(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) - assert idx == out + # TODO: Once operations like `all` are supported on Index objects, we can + # just use that without calling values first. + assert (idx == out).values.all() def test_pickle_buffer(): From 31ebbdbc5605a438d708a55052d4a7dca1475763 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 30 Apr 2021 16:12:34 -0700 Subject: [PATCH 19/20] Reintroduce _constructor properties for dask. --- python/cudf/cudf/core/dataframe.py | 5 +++++ python/cudf/cudf/core/series.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e5d582fce42..f2be0e3bd6e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -554,6 +554,11 @@ def _align_input_series_indices(data, index): return data, index + # The `constructor*` properties are used by `dask` (and `dask_cudf`) + @property + def _constructor(self): + return DataFrame + @property def _constructor_sliced(self): return Series diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0a7fe8b69d9..74cb3363295 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -62,6 +62,11 @@ class Series(FrameOneD, Serializable): + # The `constructor*` properties are used by `dask` (and `dask_cudf`) + @property + def _constructor(self): + return Series + @property def _constructor_sliced(self): raise NotImplementedError( From 6c2ee13560267557daab72ee3e8dfaac7ae95496 
Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 May 2021 08:42:03 -0700 Subject: [PATCH 20/20] Rename FrameOneD to SingleColumnFrame. --- python/cudf/cudf/core/frame.py | 6 +++--- python/cudf/cudf/core/index.py | 22 +++++++++++----------- python/cudf/cudf/core/multiindex.py | 6 +++--- python/cudf/cudf/core/series.py | 8 ++++---- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c51d1a80890..4a434be42ce 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3301,10 +3301,10 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) -class FrameOneD(Frame): +class SingleColumnFrame(Frame): """A one-dimensional frame. - Frames with only a single dimension share certain logic that is encoded in + Frames with only a single column share certain logic that is encoded in this class. """ @@ -3459,7 +3459,7 @@ def from_arrow(cls, array): Returns ------- - FrameOneD + SingleColumnFrame Examples -------- diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d82b4f53e20..5f390be79e2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -29,7 +29,7 @@ ) from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import FrameOneD +from cudf.core.frame import SingleColumnFrame from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -45,7 +45,7 @@ from cudf.utils.utils import cached_property, search_range -class Index(FrameOneD, Serializable): +class Index(SingleColumnFrame, Serializable): dtype: DtypeObj @@ -1427,7 +1427,7 @@ def _from_table(cls, table): @classmethod def _from_data(cls, data, index=None): - return cls._from_table(FrameOneD(data=data)) + return cls._from_table(SingleColumnFrame(data=data)) @property def _constructor_expanddim(self): @@ -1474,7 
+1474,7 @@ def __new__( if step == 0: raise ValueError("Step must not be zero.") - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) if isinstance(start, range): therange = start start = therange.start @@ -1827,7 +1827,7 @@ def __new__(cls, values, **kwargs): Column's name. Otherwise if this name is different from the value Column's, the values Column will be cloned to adopt this name. """ - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) out._initialize(values, **kwargs) return out @@ -2043,7 +2043,7 @@ class NumericIndex(GenericIndex): def __new__(cls, data=None, dtype=None, copy=False, name=None): - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) dtype = _index_to_dtype[cls] if copy: data = column.as_column(data, dtype=dtype).copy() @@ -2165,7 +2165,7 @@ def __new__( # pandas dtindex creation first which. For now # just make sure we handle np.datetime64 arrays # and then just dispatch upstream - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) if freq is not None: raise NotImplementedError("Freq is not yet supported") @@ -2420,7 +2420,7 @@ def __new__( name=None, ) -> "TimedeltaIndex": - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) if freq is not None: raise NotImplementedError("freq is not yet supported") @@ -2552,7 +2552,7 @@ def __new__( ) if copy: data = column.as_column(data, dtype=dtype).copy(deep=True) - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, CategoricalColumn): data = data @@ -2778,7 +2778,7 @@ def __new__( ) -> "IntervalIndex": if copy: data = column.as_column(data, dtype=dtype).copy() - out = FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) kwargs = _setdefault_name(data, name=name) if isinstance(data, IntervalColumn): data = data @@ -2851,7 +2851,7 @@ class StringIndex(GenericIndex): """ def __new__(cls, values, copy=False, **kwargs): - out = 
FrameOneD.__new__(cls) + out = SingleColumnFrame.__new__(cls) kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): values = values.copy(deep=copy) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 896de51f046..ca029198e52 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -19,7 +19,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import column from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, FrameOneD +from cudf.core.frame import Frame, SingleColumnFrame from cudf.core.index import Index, as_index @@ -572,7 +572,7 @@ def from_arrow(cls, table): names=['a', 'b']) """ - return super(FrameOneD, cls).from_arrow(table) + return super(SingleColumnFrame, cls).from_arrow(table) def to_arrow(self): """Convert MultiIndex to PyArrow Table @@ -606,7 +606,7 @@ def to_arrow(self): ] """ - return super(FrameOneD, self).to_arrow() + return super(SingleColumnFrame, self).to_arrow() @property def codes(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 74cb3363295..5ee40d576b6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -38,7 +38,7 @@ from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import FrameOneD, _drop_rows_by_labels +from cudf.core.frame import SingleColumnFrame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer @@ -61,7 +61,7 @@ ) -class Series(FrameOneD, Serializable): +class Series(SingleColumnFrame, Serializable): # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -1043,9 +1043,9 @@ def 
__getitem__(self, arg): else: return self.loc[arg] - iteritems = FrameOneD.__iter__ + iteritems = SingleColumnFrame.__iter__ - items = FrameOneD.__iter__ + items = SingleColumnFrame.__iter__ def to_dict(self, into=dict): raise TypeError(