From b23b57b686fc5c26bf0dfa60698f5a8ba58c8697 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 2 Mar 2022 17:07:29 -0800 Subject: [PATCH 01/17] Split num_rows implementation between Frame and IndexedFrame. --- python/cudf/cudf/core/frame.py | 6 +----- python/cudf/cudf/core/indexed_frame.py | 5 +++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 07cc3ea71cd..6820fe8fc7a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -144,11 +144,7 @@ def _num_columns(self) -> int: @property def _num_rows(self) -> int: - if self._index is not None: - return len(self._index) - if len(self._data) == 0: - return 0 - return len(self._data.columns[0]) + return 0 if self._num_columns == 0 else len(self._data.columns[0]) @property def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3fa951241f7..256dc104586 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -175,6 +175,11 @@ def to_dict(self, *args, **kwargs): # noqa: D102 "`.to_pandas().to_dict()` to construct a Python dictionary." ) + @property + def _num_rows(self) -> int: + # Important to use the index because the data may be empty. + return len(self._index) + @property def index(self): """Get the labels for the rows.""" From 696772b38a2dd14c8c8389eb67c9d9cbfe93c82a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 2 Mar 2022 18:08:12 -0800 Subject: [PATCH 02/17] Refactor _num_rows and _from_data. --- python/cudf/cudf/core/dataframe.py | 17 +++++++++++------ python/cudf/cudf/core/frame.py | 17 ++++++++--------- python/cudf/cudf/core/index.py | 11 ++++++++++- python/cudf/cudf/core/indexed_frame.py | 18 ++++++++++++++++-- python/cudf/cudf/core/multiindex.py | 12 +++--------- python/cudf/cudf/core/series.py | 11 +++++------ python/cudf/cudf/tests/test_dataframe.py | 12 ++---------- 7 files changed, 55 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 57d591dd3e7..0de80ed836d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -612,7 +612,7 @@ def __init__( new_df = self._from_arrays(data, index=index, columns=columns) self._data = new_df._data - self.index = new_df._index + self._index = new_df._index elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: @@ -621,7 +621,7 @@ def __init__( else: new_df = self.from_records(data, index=index, columns=columns) self._data = new_df._data - self.index = new_df._index + self._index = new_df._index else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): @@ -632,7 +632,7 @@ def __init__( new_df = DataFrame(data=data, index=index) self._data = new_df._data - self.index = new_df._index + self._index = new_df._index elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -650,6 +650,11 @@ def __init__( data, index=index, columns=columns, nan_as_null=nan_as_null ) + if self._data.nrows > 0 and self._data.nrows != len(self._index): + raise ValueError( + f"Shape of passed values is {self.shape}, indices imply " + f"({len(self._index)}, {self._num_columns})" + ) if dtype: self._data = self.astype(dtype)._data @@ -855,10 +860,10 @@ def _from_data( data: MutableMapping, index: Optional[BaseIndex] = None, columns: Any = None, + *args, + **kwargs, ) -> DataFrame: - out = super()._from_data(data, index) - if index is None: - out.index = RangeIndex(out._data.nrows) + out = super()._from_data(data=data, index=index) if columns is not None: out.columns = columns return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6820fe8fc7a..813dff24104 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -181,13 +181,9 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate - def _from_data( - cls, - data: MutableMapping, - index: Optional[cudf.core.index.BaseIndex] = None, - ): + def _from_data(cls, data: MutableMapping, *args, **kwargs): obj = cls.__new__(cls) - Frame.__init__(obj, data, index) + Frame.__init__(obj, data) return obj @classmethod @@ -1320,10 +1316,13 @@ def fillna( else: filled_data[col_name] = col.copy(deep=True) - return self._mimic_inplace( - self._from_data(data=filled_data, index=self._index), - inplace=inplace, + ret = self._mimic_inplace( + self._from_data(data=filled_data), inplace=inplace, ) + # TODO: Split this logic into the IndexedFrame class. + if isinstance(ret, cudf.core.indexed_frame.IndexedFrame): + ret._index = self._index + return ret @_cudf_nvtx_annotate def _drop_column(self, name): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c68289898f..e944e5e61e9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -113,7 +113,7 @@ def _index_from_data(data: MutableMapping, name: Any = None): index_class_type = IntervalIndex else: index_class_type = cudf.MultiIndex - return index_class_type._from_data(data, None, name) + return index_class_type._from_data(data, name) def _index_from_columns( @@ -838,7 +838,16 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + @classmethod @_cudf_nvtx_annotate + def _from_data( + cls, data: MutableMapping, name: Any = None, *args, **kwargs + ) -> GenericIndex: + out = super()._from_data(data=data) + if name is not None: + out.name = name + return out + def _binaryop( self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> SingleColumnFrame: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 256dc104586..b9b735fb222 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -8,7 +8,7 @@ import warnings from collections import Counter, abc from functools import cached_property -from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union +from typing import Any, Callable, MutableMapping, Dict, Optional, Tuple, Type, TypeVar, Union from uuid import uuid4 import cupy as cp @@ -180,6 +180,18 @@ def _num_rows(self) -> int: # Important to use the index because the data may be empty. return len(self._index) + @classmethod + def _from_data( + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, + *args, + **kwargs, + ): + out = super()._from_data(data, *args, **kwargs) + out._index = RangeIndex(out._data.nrows) if index is None else index + return out + @property def index(self): """Get the labels for the rows.""" @@ -1067,7 +1079,9 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data(result._data, index=result.index) + result = self.__class__._from_data( + data=result._data, index=result.index + ) result._data.multiindex = self._data.multiindex result._data._level_names = self._data._level_names result.index.names = self.index.names diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index c9036db05fa..c1ef8e315be 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ from collections.abc import Sequence from functools import cached_property from numbers import Integral -from typing import Any, List, MutableMapping, Optional, Tuple, Union +from typing import Any, List, MutableMapping, Tuple, Union import cupy import numpy as np @@ -278,14 +278,8 @@ def set_names(self, names, level=None, inplace=False): @classmethod @_cudf_nvtx_annotate - def _from_data( - cls, - data: MutableMapping, - index: Optional[cudf.core.index.BaseIndex] = None, - name: Any = None, - ) -> MultiIndex: - assert index is None - obj = cls.from_frame(cudf.DataFrame._from_data(data)) + def _from_data(cls, data: MutableMapping, name: Any = None,) -> MultiIndex: + obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) if name is not None: obj.name = name return obj diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b3b73b8961c..5d862213165 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -458,13 +458,12 @@ def _from_data( data: MutableMapping, index: Optional[BaseIndex] = None, name: Any = None, + *args, + **kwargs, ) -> Series: - """ - Construct the Series from a ColumnAccessor - """ - out: Series = super()._from_data(data, index, name) - if index is None: - out._index = RangeIndex(out._data.nrows) + out = super()._from_data(data=data, index=index) + if name is not None: + out.name = name return out @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5bde75c2e21..136deb59334 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1565,18 +1565,10 @@ def test_dataframe_cupy_wrong_dimensions(): def test_dataframe_cupy_array_wrong_index(): d_ary = cupy.empty((2, 3), dtype=np.int32) - with pytest.raises( - ValueError, - match="Length mismatch: Expected axis has 2 elements, " - "new values have 1 elements", - ): + with pytest.raises(ValueError): cudf.DataFrame(d_ary, index=["a"]) - with pytest.raises( - ValueError, - match="Length mismatch: Expected axis has 2 elements, " - "new values have 1 elements", - ): + with pytest.raises(ValueError): cudf.DataFrame(d_ary, index="a") From b959413f7004528374f82353e5a159944fd14be0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 3 Mar 2022 09:38:19 -0800 Subject: [PATCH 03/17] Split _from_columns and _from_columns_like_self. --- python/cudf/cudf/core/frame.py | 47 ++++------------------- python/cudf/cudf/core/indexed_frame.py | 53 +++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 813dff24104..d498ca686df 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -189,51 +189,20 @@ def _from_data(cls, data: MutableMapping, *args, **kwargs): @classmethod @_cudf_nvtx_annotate def _from_columns( - cls, - columns: List[ColumnBase], - column_names: abc.Iterable[str], - index_names: Optional[List[str]] = None, + cls, columns: List[ColumnBase], column_names: abc.Iterable[str], ): - """Construct a `Frame` object from a list of columns. - - If `index_names` is set, the first `len(index_names)` columns are - used to construct the index of the frame. - """ - index = None - n_index_columns = 0 - if index_names is not None: - n_index_columns = len(index_names) - index = cudf.core.index._index_from_columns( - columns[:n_index_columns] - ) - if isinstance(index, cudf.MultiIndex): - index.names = index_names - else: - index.name = index_names[0] + """Construct a `Frame` object from a list of columns.""" + data = {name: columns[i] for i, name in enumerate(column_names)} - data = { - name: columns[i + n_index_columns] - for i, name in enumerate(column_names) - } - - return cls._from_data(data, index) + return cls._from_data(data) @_cudf_nvtx_annotate def _from_columns_like_self( - self, - columns: List[ColumnBase], - column_names: abc.Iterable[str], - index_names: Optional[List[str]] = None, + self, columns: List[ColumnBase], column_names: abc.Iterable[str], ): - """Construct a `Frame` from a list of columns with metadata from self. - - If `index_names` is set, the first `len(index_names)` columns are - used to construct the index of the frame. - """ - frame = self.__class__._from_columns( - columns, column_names, index_names - ) - return frame._copy_type_metadata(self, include_index=bool(index_names)) + """Construct a Frame from a list of columns with metadata from self.""" + frame = self.__class__._from_columns(columns, column_names) + return frame._copy_type_metadata(self) def _mimic_inplace( self: T, result: Frame, inplace: bool = False diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b9b735fb222..6fd2bae978b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -8,7 +8,7 @@ import warnings from collections import Counter, abc from functools import cached_property -from typing import Any, Callable, MutableMapping, Dict, Optional, Tuple, Type, TypeVar, Union +from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple, Type, TypeVar, Union from uuid import uuid4 import cupy as cp @@ -26,6 +26,7 @@ is_list_dtype, is_list_like, ) +from cudf.core._base_index import BaseIndex from cudf.core.column import ColumnBase from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels @@ -192,6 +193,56 @@ def _from_data( out._index = RangeIndex(out._data.nrows) if index is None else index return out + @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") + def _from_columns( + cls, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` object from a list of columns. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + data_columns = columns + + n_index_columns = len(index_names) if index_names else 0 + index_columns = columns[:n_index_columns] + data_columns = columns[n_index_columns:] + + out = super()._from_columns(data_columns, column_names) + + if index_names is not None: + out._index = cudf.core.index._index_from_columns(index_columns) + if isinstance(out._index, cudf.MultiIndex): + out._index.names = index_names + else: + assert len(index_names) == 1 + out._index.name = index_names[0] + + return out + + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) + def _from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` from a list of columns with metadata from self. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + frame = self.__class__._from_columns( + columns, column_names, index_names + ) + return frame._copy_type_metadata(self, include_index=bool(index_names)) + @property def index(self): """Get the labels for the rows.""" From de5ca14ad41dc9d74bca850863d33111e5c28692 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 3 Mar 2022 10:20:51 -0800 Subject: [PATCH 04/17] Fix bug in fillna. --- python/cudf/cudf/core/frame.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d498ca686df..ed7b7bc56d3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1285,12 +1285,17 @@ def fillna( else: filled_data[col_name] = col.copy(deep=True) + # TODO: This logic needs to move into the IndexedFrame class. + old_index = self._index ret = self._mimic_inplace( self._from_data(data=filled_data), inplace=inplace, ) # TODO: Split this logic into the IndexedFrame class. - if isinstance(ret, cudf.core.indexed_frame.IndexedFrame): - ret._index = self._index + if isinstance(self, cudf.core.indexed_frame.IndexedFrame): + if inplace: + self._index = old_index + else: + ret._index = old_index return ret @_cudf_nvtx_annotate From 3f2be82da648280c06e64f8e706965ae1a23dca2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 3 Mar 2022 10:26:01 -0800 Subject: [PATCH 05/17] Remove now unnecessary _as_column. --- python/cudf/cudf/core/frame.py | 13 ------------- python/cudf/cudf/core/indexed_frame.py | 6 ++++-- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ed7b7bc56d3..0ff4ec42033 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -585,19 +585,6 @@ def _get_columns_by_index(self, indices): data, columns=data.to_pandas_index(), index=self.index ) - def _as_column(self): - """ - _as_column : Converts a single columned Frame to Column - """ - assert ( - self._num_columns == 1 - and self._index is None - and self._column_names[0] is None - ), """There should be only one data column, - no index and None as the name to use this method""" - - return self._data[None].copy(deep=False) - @property def values(self): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6fd2bae978b..157748889f3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -194,7 +194,7 @@ def _from_data( return out @classmethod - @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") + @annotate("INDEXEDFRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], @@ -225,7 +225,9 @@ def _from_columns( return out @annotate( - "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + "INDEXEDFRAME_FROM_COLUMNS_LIKE_SELF", + color="green", + domain="cudf_python", ) def _from_columns_like_self( self, From 3f13f7eab992fc023de51cbd4305b49eda6ee81f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 7 Mar 2022 14:19:17 -0800 Subject: [PATCH 06/17] Fix style. --- python/cudf/cudf/core/indexed_frame.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 157748889f3..30f5b1e0475 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -8,7 +8,18 @@ import warnings from collections import Counter, abc from functools import cached_property -from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple, Type, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + TypeVar, + Union, +) from uuid import uuid4 import cupy as cp From 3fd4cb9744c32626c504111a0343480832223dfd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Mar 2022 11:35:51 -0700 Subject: [PATCH 07/17] Update annotations. --- python/cudf/cudf/core/indexed_frame.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30f5b1e0475..3e54279c6d8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -205,7 +205,7 @@ def _from_data( return out @classmethod - @annotate("INDEXEDFRAME_FROM_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_columns( cls, columns: List[ColumnBase], @@ -235,11 +235,7 @@ def _from_columns( return out - @annotate( - "INDEXEDFRAME_FROM_COLUMNS_LIKE_SELF", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _from_columns_like_self( self, columns: List[ColumnBase], From 438152bc57264a45fae5fd7d8fb56a47782ef3fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Mar 2022 12:14:11 -0700 Subject: [PATCH 08/17] Move copy and mimic_inplace. --- python/cudf/cudf/core/frame.py | 87 -------------------------- python/cudf/cudf/core/indexed_frame.py | 76 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 87 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0ff4ec42033..db52e6ba061 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -214,7 +214,6 @@ def _mimic_inplace( result._data[col], inplace=True ) self._data = result._data - self._index = result._index return None else: return result @@ -385,92 +384,6 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows - @_cudf_nvtx_annotate - def copy(self: T, deep: bool = True) -> T: - """ - Make a copy of this object's indices and data. - - When ``deep=True`` (default), a new object will be created with a - copy of the calling object's data and indices. Modifications to - the data or indices of the copy will not be reflected in the - original object (see notes below). - When ``deep=False``, a new object will be created without copying - the calling object's data or index (only references to the data - and index are copied). Any changes to the data of the original - will be reflected in the shallow copy (and vice versa). - - Parameters - ---------- - deep : bool, default True - Make a deep copy, including a copy of the data and the indices. - With ``deep=False`` neither the indices nor the data are copied. - - Returns - ------- - copy : Series or DataFrame - Object type matches caller. - - Examples - -------- - >>> s = cudf.Series([1, 2], index=["a", "b"]) - >>> s - a 1 - b 2 - dtype: int64 - >>> s_copy = s.copy() - >>> s_copy - a 1 - b 2 - dtype: int64 - - **Shallow copy versus default (deep) copy:** - - >>> s = cudf.Series([1, 2], index=["a", "b"]) - >>> deep = s.copy() - >>> shallow = s.copy(deep=False) - - Shallow copy shares data and index with original. - - >>> s is shallow - False - >>> s._column is shallow._column and s.index is shallow.index - True - - Deep copy has own copy of data and index. - - >>> s is deep - False - >>> s.values is deep.values or s.index is deep.index - False - - Updates to the data shared by shallow copy and original is reflected - in both; deep copy remains unchanged. - - >>> s['a'] = 3 - >>> shallow['b'] = 4 - >>> s - a 3 - b 4 - dtype: int64 - >>> shallow - a 3 - b 4 - dtype: int64 - >>> deep - a 1 - b 2 - dtype: int64 - """ - new_frame = self.__class__.__new__(self.__class__) - new_frame._data = self._data.copy(deep=deep) - - if self._index is not None: - new_frame._index = self._index.copy(deep=deep) - else: - new_frame._index = None - - return new_frame - @_cudf_nvtx_annotate def astype(self, dtype, copy=False, **kwargs): result = {} diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3e54279c6d8..fafdd43c06b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,6 +19,7 @@ Type, TypeVar, Union, + cast, ) from uuid import uuid4 @@ -252,6 +253,81 @@ def _from_columns_like_self( ) return frame._copy_type_metadata(self, include_index=bool(index_names)) + def _mimic_inplace( + self: T, result: Frame, inplace: bool = False + ) -> Optional[Frame]: + # TODO: Is there a better way to make mypy happy? + result = cast(IndexedFrame, result) + if inplace: + self._index = result._index + return super()._mimic_inplace(result, inplace) + + def copy(self: T, deep: bool = True) -> T: + """Make a copy of this object's indices and data. + + When ``deep=True`` (default), a new object will be created with a + copy of the calling object's data and indices. Modifications to + the data or indices of the copy will not be reflected in the + original object (see notes below). + When ``deep=False``, a new object will be created without copying + the calling object's data or index (only references to the data + and index are copied). Any changes to the data of the original + will be reflected in the shallow copy (and vice versa). + + Parameters + ---------- + deep : bool, default True + Make a deep copy, including a copy of the data and the indices. + With ``deep=False`` neither the indices nor the data are copied. + + Returns + ------- + copy : Series or DataFrame + Object type matches caller. + + Examples + -------- + >>> s = cudf.Series([1, 2], index=["a", "b"]) + >>> s + a 1 + b 2 + dtype: int64 + >>> s_copy = s.copy() + >>> s_copy + a 1 + b 2 + dtype: int64 + + **Shallow copy versus default (deep) copy:** + + >>> s = cudf.Series([1, 2], index=["a", "b"]) + >>> deep = s.copy() + >>> shallow = s.copy(deep=False) + + Updates to the data shared by shallow copy and original is reflected + in both; deep copy remains unchanged. + + >>> s['a'] = 3 + >>> shallow['b'] = 4 + >>> s + a 3 + b 4 + dtype: int64 + >>> shallow + a 3 + b 4 + dtype: int64 + >>> deep + a 1 + b 2 + dtype: int64 + """ + return self._from_data( + self._data.copy(deep=deep), + # Indexes are immutable so copies can always be shallow. + self._index.copy(deep=False), + ) + @property def index(self): """Get the labels for the rows.""" From 6da676af01649c2c515be17cfe889c2422d30bb3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Mar 2022 14:24:36 -0700 Subject: [PATCH 09/17] Standardize equals. --- python/cudf/cudf/core/frame.py | 35 +++++++++----------------- python/cudf/cudf/core/index.py | 18 ++++++++----- python/cudf/cudf/core/indexed_frame.py | 6 +++++ 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index db52e6ba061..e2cf5abb3a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -397,7 +397,7 @@ def astype(self, dtype, copy=False, **kwargs): return result @_cudf_nvtx_annotate - def equals(self, other, **kwargs): + def equals(self, other): """ Test whether two objects contain the same elements. This function allows two Series or DataFrames to be compared against @@ -454,30 +454,19 @@ def equals(self, other, **kwargs): >>> df.equals(different_column_type) True """ - if self is other: - return True - - check_types = kwargs.get("check_types", True) - - if check_types: - if type(self) is not type(other): - return False - - if other is None or len(self) != len(other): - return False - - # check data: - for self_col, other_col in zip( - self._data.values(), other._data.values() + if ( + other is None + or not isinstance(other, type(self)) + or len(self) != len(other) ): - if not self_col.equals(other_col, check_dtypes=check_types): - return False + return False - # check index: - if self._index is None: - return other._index is None - else: - return self._index.equals(other._index) + return all( + self_col.equals(other_col, check_dtypes=True) + for self_col, other_col in zip( + self._data.values(), other._data.values() + ) + ) @_cudf_nvtx_annotate def _get_columns_by_label(self, labels, downcast=False): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e944e5e61e9..802b25684c2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -375,7 +375,7 @@ def equals(self, other): other._step, ): return True - return Int64Index._from_data(self._data).equals(other) + return self._as_int64().equals(other) @_cudf_nvtx_annotate def serialize(self): @@ -921,22 +921,28 @@ def equals(self, other, **kwargs): True if “other” is an Index and it has the same elements as calling index; False otherwise. """ - if not isinstance(other, BaseIndex): + if ( + other is None + or not isinstance(other, BaseIndex) + or len(self) != len(other) + ): return False - check_types = False + check_dtypes = False self_is_categorical = isinstance(self, CategoricalIndex) other_is_categorical = isinstance(other, CategoricalIndex) if self_is_categorical and not other_is_categorical: other = other.astype(self.dtype) - check_types = True + check_dtypes = True elif other_is_categorical and not self_is_categorical: self = self.astype(other.dtype) - check_types = True + check_dtypes = True try: - return super().equals(other, check_types=check_types) + return self._column.equals( + other._column, check_dtypes=check_dtypes + ) except TypeError: return False diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fafdd43c06b..3e09fbdffd4 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -328,6 +328,12 @@ def copy(self: T, deep: bool = True) -> T: self._index.copy(deep=False), ) + @_cudf_nvtx_annotate + def equals(self, other): # noqa: D102 + if not super().equals(other): + return False + return self._index.equals(other._index) + @property def index(self): """Get the labels for the rows.""" From 61f9d07de80f9c83cf9e6c4a4a8380e1d7715dbb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Mar 2022 15:03:47 -0700 Subject: [PATCH 10/17] Simplify column selection by index. --- python/cudf/cudf/core/column_accessor.py | 28 ++++++++++++++++++------ python/cudf/cudf/core/dataframe.py | 12 +++++----- python/cudf/cudf/core/frame.py | 11 ---------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 9cb86ca1cd2..c9c00692174 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -342,6 +342,26 @@ def select_by_label(self, key: Any) -> ColumnAccessor: return self._select_by_label_with_wildcard(key) return self._select_by_label_grouped(key) + def get_labels_by_index(self, index: Any) -> tuple: + """Get the labels corresponding to the provided column indices. + + Parameters + ---------- + index : integer, integer slice, or list-like of integers + The column indexes. + + Returns + ------- + tuple + """ + if isinstance(index, slice): + start, stop, step = index.indices(len(self._data)) + return self.names[start:stop:step] + elif pd.api.types.is_integer(index): + return (self.names[index],) + else: + return tuple(self.names[i] for i in index) + def select_by_index(self, index: Any) -> ColumnAccessor: """ Return a ColumnAccessor composed of the columns @@ -355,13 +375,7 @@ def select_by_index(self, index: Any) -> ColumnAccessor: ------- ColumnAccessor """ - if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) - keys = self.names[start:stop:step] - elif pd.api.types.is_integer(index): - keys = (self.names[index],) - else: - keys = tuple(self.names[i] for i in index) + keys = self.get_labels_by_index(index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0de80ed836d..b93dc2a7993 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -372,9 +372,9 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): def _getitem_tuple_arg(self, arg): # Iloc Step 1: # Gather the columns specified by the second tuple arg - columns_df = self._frame._get_columns_by_index(arg[1]) - - columns_df._index = self._frame._index + columns_df = self._frame._from_data( + self._frame._data.select_by_index(arg[1]), self._frame._index + ) # Iloc Step 2: # Gather the rows specified by the first tuple arg @@ -422,9 +422,9 @@ def _getitem_tuple_arg(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - columns = self._frame._get_columns_by_index(key[1]) - - for col in columns: + # TODO: Determine if this usage is prevalent enough to expose this + # selection logic at a higher level than ColumnAccessor. + for col in self._frame._data.get_labels_by_index(key[1]): self._frame[col].iloc[key[0]] = value def _getitem_scalar(self, arg): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e2cf5abb3a2..b54f92af9bd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -476,17 +476,6 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) - @_cudf_nvtx_annotate - def _get_columns_by_index(self, indices): - """ - Returns columns of the Frame specified by `labels` - - """ - data = self._data.select_by_index(indices) - return self.__class__._from_data( - data, columns=data.to_pandas_index(), index=self.index - ) - @property def values(self): """ From 6c607af2f2f8e5e73965de0d337017814e6b27fb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Mar 2022 15:27:39 -0700 Subject: [PATCH 11/17] Simplify clip and prep for move to IndexedFrame. --- python/cudf/cudf/core/frame.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b54f92af9bd..36ec290da42 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -721,6 +721,10 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): 3 4 dtype: int64 """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.clip is deprecated and will be removed.", FutureWarning, + ) if axis != 1: raise NotImplementedError("`axis is not yet supported in clip`") @@ -738,13 +742,10 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): if len(lower) != self._num_columns: raise ValueError( - """Length of lower/upper should be - equal to number of columns in - DataFrame/Series/Index/MultiIndex""" + "Length of lower/upper should be equal to number of columns" ) - output = self.copy(deep=False) - if output.ndim == 1: + if self.ndim == 1: # In case of series and Index, # swap lower and upper if lower > upper if ( @@ -754,11 +755,12 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ): lower[0], upper[0] = upper[0], lower[0] - for i, name in enumerate(self._data): - output._data[name] = self._data[name].clip(lower[i], upper[i]) - + data = { + name: col.clip(lower[i], upper[i]) + for i, (name, col) in enumerate(self._data.items()) + } + output = self._from_data(data, self._index) output._copy_type_metadata(self, include_index=False) - return self._mimic_inplace(output, inplace=inplace) @_cudf_nvtx_annotate From 991ca6a6c4dff44cef1dc82012d574e6fc00a59e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Mar 2022 17:20:33 -0700 Subject: [PATCH 12/17] Address first set of PR reviews. --- python/cudf/cudf/core/frame.py | 7 ++----- python/cudf/cudf/core/indexed_frame.py | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 36ec290da42..20da872f117 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1171,11 +1171,8 @@ def fillna( self._from_data(data=filled_data), inplace=inplace, ) # TODO: Split this logic into the IndexedFrame class. - if isinstance(self, cudf.core.indexed_frame.IndexedFrame): - if inplace: - self._index = old_index - else: - ret._index = old_index + if isinstance(self, cudf.core.indexed_frame.IndexedFrame) and not inplace: + ret._index = old_index return ret @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3e09fbdffd4..7c9b0381c69 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -231,7 +231,6 @@ def _from_columns( if isinstance(out._index, cudf.MultiIndex): out._index.names = index_names else: - assert len(index_names) == 1 out._index.name = index_names[0] return out From 87ed661f8a86b5a23acfab942612233101b6d61d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Mar 2022 17:41:52 -0700 Subject: [PATCH 13/17] Make mypy happy in better ways. --- python/cudf/cudf/core/dataframe.py | 2 -- python/cudf/cudf/core/frame.py | 9 ++++--- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 13 +++------- python/cudf/cudf/core/series.py | 2 -- python/cudf/cudf/core/single_column_frame.py | 25 +------------------- 6 files changed, 11 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b93dc2a7993..738ff59c33e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -860,8 +860,6 @@ def _from_data( data: MutableMapping, index: Optional[BaseIndex] = None, columns: Any = None, - *args, - **kwargs, ) -> DataFrame: out = super()._from_data(data=data, index=index) if columns is not None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 20da872f117..fab0c7fafb0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -181,7 +181,7 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate - def _from_data(cls, data: MutableMapping, *args, **kwargs): + def _from_data(cls, data: MutableMapping): obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @@ -205,7 +205,7 @@ def _from_columns_like_self( return frame._copy_type_metadata(self) def _mimic_inplace( - self: T, result: Frame, inplace: bool = False + self: T, result: T, inplace: bool = False ) -> Optional[Frame]: if inplace: for col in self._data: @@ -1171,7 +1171,10 @@ def fillna( self._from_data(data=filled_data), inplace=inplace, ) # TODO: Split this logic into the IndexedFrame class. - if isinstance(self, cudf.core.indexed_frame.IndexedFrame) and not inplace: + if ( + isinstance(self, cudf.core.indexed_frame.IndexedFrame) + and not inplace + ): ret._index = old_index return ret diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 802b25684c2..60bfeec9a72 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -841,7 +841,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @classmethod @_cudf_nvtx_annotate def _from_data( - cls, data: MutableMapping, name: Any = None, *args, **kwargs + cls, data: MutableMapping, name: Any = None ) -> GenericIndex: out = super()._from_data(data=data) if name is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7c9b0381c69..adc8818aea3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,7 +19,6 @@ Type, TypeVar, Union, - cast, ) from uuid import uuid4 @@ -195,13 +194,9 @@ def _num_rows(self) -> int: @classmethod def _from_data( - cls, - data: MutableMapping, - index: Optional[BaseIndex] = None, - *args, - **kwargs, + cls, data: MutableMapping, index: Optional[BaseIndex] = None, ): - out = super()._from_data(data, *args, **kwargs) + out = super()._from_data(data) out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -253,10 +248,8 @@ def _from_columns_like_self( return frame._copy_type_metadata(self, include_index=bool(index_names)) def _mimic_inplace( - self: T, result: Frame, inplace: bool = False + self: T, result: T, inplace: bool = False ) -> Optional[Frame]: - # TODO: Is there a better way to make mypy happy? - result = cast(IndexedFrame, result) if inplace: self._index = result._index return super()._mimic_inplace(result, inplace) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5d862213165..0bb82b4ddad 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -458,8 +458,6 @@ def _from_data( data: MutableMapping, index: Optional[BaseIndex] = None, name: Any = None, - *args, - **kwargs, ) -> Series: out = super()._from_data(data=data, index=index) if name is not None: diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b35d653e28f..de10261315c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,16 +3,7 @@ from __future__ import annotations -from typing import ( - Any, - Dict, - MutableMapping, - Optional, - Tuple, - Type, - TypeVar, - Union, -) +from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union import cupy import numpy as np @@ -67,20 +58,6 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) - @classmethod - @_cudf_nvtx_annotate - def _from_data( - cls, - data: MutableMapping, - index: Optional[cudf.core.index.BaseIndex] = None, - name: Any = None, - ): - - out = super()._from_data(data, index) - if name is not None: - out.name = name - return out - @property # type: ignore @_cudf_nvtx_annotate def name(self): From 292acfa5d7ada9f4b59641e333041ed18d8d0b75 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Mar 2022 09:36:14 -0700 Subject: [PATCH 14/17] Avoid constructing a Frame and just use DataFrame._from_data to construct it fast. --- python/cudf/cudf/core/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 738ff59c33e..83530153641 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5603,7 +5603,9 @@ def stack(self, level=-1, dropna=True): """ assert level in (None, -1) repeated_index = self.index.repeat(self.shape[1]) - name_index = Frame({0: self._column_names}).tile(self.shape[0]) + name_index = cudf.DataFrame._from_data({0: self._column_names}).tile( + self.shape[0] + ) new_index = list(repeated_index._columns) + [name_index._columns[0]] if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] From f2bf9aa337c9c003b2dd11fba769a1c46cceded1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Mar 2022 09:45:09 -0700 Subject: [PATCH 15/17] Fix bug. --- python/cudf/cudf/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fab0c7fafb0..51cbfcebb00 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1171,11 +1171,11 @@ def fillna( self._from_data(data=filled_data), inplace=inplace, ) # TODO: Split this logic into the IndexedFrame class. - if ( - isinstance(self, cudf.core.indexed_frame.IndexedFrame) - and not inplace - ): - ret._index = old_index + if isinstance(self, cudf.core.indexed_frame.IndexedFrame): + if inplace: + self._index = old_index + else: + ret._index = old_index return ret @_cudf_nvtx_annotate From 25b5df9b0601ba9e9c87ac8ed834fba12a2a1ac0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Mar 2022 13:47:31 -0700 Subject: [PATCH 16/17] Move index replacement logic in fillna to IndexedFrame. --- python/cudf/cudf/core/frame.py | 13 ++----------- python/cudf/cudf/core/indexed_frame.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 51cbfcebb00..aead63313f7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1037,7 +1037,7 @@ def fillna( Returns ------- - result : DataFrame + result : DataFrame, Series, or Index Copy with nulls filled. Examples @@ -1165,18 +1165,9 @@ def fillna( else: filled_data[col_name] = col.copy(deep=True) - # TODO: This logic needs to move into the IndexedFrame class. - old_index = self._index - ret = self._mimic_inplace( + return self._mimic_inplace( self._from_data(data=filled_data), inplace=inplace, ) - # TODO: Split this logic into the IndexedFrame class. - if isinstance(self, cudf.core.indexed_frame.IndexedFrame): - if inplace: - self._index = old_index - else: - ret._index = old_index - return ret @_cudf_nvtx_annotate def _drop_column(self, name): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index adc8818aea3..61efcb6c4bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -904,6 +904,18 @@ def _split(self, splits, keep_index=True): for i in range(len(splits) + 1) ] + @_cudf_nvtx_annotate + def fillna( + self, value=None, method=None, axis=None, inplace=False, limit=None + ): # noqa: D102 + old_index = self._index + ret = super().fillna(value, method, axis, inplace, limit) + if inplace: + self._index = old_index + else: + ret._index = old_index + return ret + def add_prefix(self, prefix): """ Prefix labels with string `prefix`. From 21b7ef2a3cdb8a54a28ba898fb80fa8284306156 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 18 Mar 2022 16:48:41 -0700 Subject: [PATCH 17/17] Address PR comments. --- python/cudf/cudf/core/dataframe.py | 17 ++++++++++++----- python/cudf/cudf/core/frame.py | 2 ++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 83530153641..6f05e9bd678 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -613,6 +613,7 @@ def __init__( self._data = new_df._data self._index = new_df._index + self._check_data_index_length_match() elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: @@ -622,6 +623,7 @@ def __init__( new_df = self.from_records(data, index=index, columns=columns) self._data = new_df._data self._index = new_df._index + self._check_data_index_length_match() else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): @@ -633,6 +635,7 @@ def __init__( self._data = new_df._data self._index = new_df._index + self._check_data_index_length_match() elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -650,14 +653,18 @@ def __init__( data, index=index, columns=columns, nan_as_null=nan_as_null ) - if self._data.nrows > 0 and self._data.nrows != len(self._index): - raise ValueError( - f"Shape of passed values is {self.shape}, indices imply " - f"({len(self._index)}, {self._num_columns})" - ) if dtype: self._data = self.astype(dtype)._data + def _check_data_index_length_match(df: DataFrame) -> None: + # Validate that the number of rows in the data matches the index if the + # data is not empty. This is a helper for the constructor. + if df._data.nrows > 0 and df._data.nrows != len(df._index): + raise ValueError( + f"Shape of passed values is {df.shape}, indices imply " + f"({len(df._index)}, {df._num_columns})" + ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index aead63313f7..0b476d5c982 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -454,6 +454,8 @@ def equals(self, other): >>> df.equals(different_column_type) True """ + if self is other: + return True if ( other is None or not isinstance(other, type(self))