diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index eea8e3c418f..d688b75ed14 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1365,7 +1365,12 @@ def memory_usage(self, deep=False): ------- bytes used """ - return self._values._memory_usage(deep=deep) + if deep: + warnings.warn( + "The deep parameter is ignored and is only included " + "for pandas compatibility." + ) + return self._values.memory_usage() @classmethod def from_pandas(cls, index, nan_as_null=None): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a8e868ed521..a2c1f04b2f2 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1323,15 +1323,8 @@ def copy(self, deep: bool = True) -> CategoricalColumn: size=self.size, ) - def __sizeof__(self) -> int: - return self.categories.__sizeof__() + self.codes.__sizeof__() - - def _memory_usage(self, **kwargs) -> int: - deep = kwargs.get("deep", False) - if deep: - return self.__sizeof__() - else: - return self.categories._memory_usage() + self.codes._memory_usage() + def memory_usage(self) -> int: + return self.categories.memory_usage() + self.codes.memory_usage() def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d8c99a87f92..0a3688248bd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -199,14 +199,6 @@ def any(self, skipna: bool = True) -> bool: return result_col - def __sizeof__(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size - if self.nullable: - n += bitmask_allocation_size_bytes(self.size) - return n - def dropna(self, drop_nan: bool = False) -> ColumnBase: if drop_nan: col = self.nans_to_nulls() @@ -313,13 +305,18 @@ def _get_mask_as_column(self) -> ColumnBase: self.base_mask, self.offset, self.offset + len(self) ) - def _memory_usage(self, **kwargs) -> int: - return self.__sizeof__() + def memory_usage(self) -> int: + n = 0 + if self.data is not None: + n += self.data.size + if self.nullable: + n += bitmask_allocation_size_bytes(self.size) + return n def _default_na_value(self) -> Any: raise NotImplementedError() - # TODO: This method is decpreated and can be removed when the associated + # TODO: This method is deprecated and can be removed when the associated # Frame methods are removed. def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index da51ce3becc..8a2e03edeaf 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -42,41 +42,34 @@ def __init__( children=children, ) - def __sizeof__(self): - if self._cached_sizeof is None: - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( - self.size - ) - - child0_size = (self.size + 1) * self.base_children[ - 0 - ].dtype.itemsize - current_base_child = self.base_children[1] - current_offset = self.offset + def memory_usage(self): + n = 0 + if self.nullable: + n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) + + child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize + current_base_child = self.base_children[1] + current_offset = self.offset + n += child0_size + while type(current_base_child) is ListColumn: + child0_size = ( + current_base_child.size + 1 - current_offset + ) * current_base_child.base_children[0].dtype.itemsize + current_offset = current_base_child.base_children[0][ + current_offset + ] n += child0_size - while type(current_base_child) is ListColumn: - child0_size = ( - current_base_child.size + 1 - current_offset - ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[0][ - current_offset - ] - n += child0_size - current_base_child = current_base_child.base_children[1] - - n += ( - current_base_child.size - current_offset - ) * current_base_child.dtype.itemsize - - if current_base_child.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( - current_base_child.size - ) - self._cached_sizeof = n + current_base_child = current_base_child.base_children[1] + + n += ( + current_base_child.size - current_offset + ) * current_base_child.dtype.itemsize - return self._cached_sizeof + if current_base_child.nullable: + n += cudf._lib.null_mask.bitmask_allocation_size_bytes( + current_base_child.size + ) + return n def __setitem__(self, key, value): if isinstance(value, list): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 30d762ad5fc..a167383c65c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5214,26 +5214,21 @@ def end_offset(self) -> int: return self._end_offset - def __sizeof__(self) -> int: - if self._cached_sizeof is None: - n = 0 - if len(self.base_children) == 2: - child0_size = (self.size + 1) * self.base_children[ - 0 - ].dtype.itemsize - - child1_size = ( - self.end_offset - self.start_offset - ) * self.base_children[1].dtype.itemsize - - n += child0_size + child1_size - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( - self.size - ) - self._cached_sizeof = n + def memory_usage(self) -> int: + n = 0 + if len(self.base_children) == 2: + child0_size = (self.size + 1) * self.base_children[ + 0 + ].dtype.itemsize + + child1_size = ( + self.end_offset - self.start_offset + ) * self.base_children[1].dtype.itemsize - return self._cached_sizeof + n += child0_size + child1_size + if self.nullable: + n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) + return n @property def base_size(self) -> int: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2849536dcdb..8c2e3c8cc7f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1138,11 +1138,6 @@ def __delitem__(self, name): """ self._drop_column(name) - def __sizeof__(self): - columns = sum(col.__sizeof__() for col in self._data.columns) - index = self._index.__sizeof__() - return columns + index - def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1253,12 +1248,17 @@ def memory_usage(self, index=True, deep=False): >>> df['object'].astype('category').memory_usage(deep=True) 5048 """ + if deep: + warnings.warn( + "The deep parameter is ignored and is only included " + "for pandas compatibility." + ) ind = list(self.columns) - sizes = [col._memory_usage(deep=deep) for col in self._data.columns] + sizes = [col.memory_usage() for col in self._data.columns] if index: ind.append("Index") ind = cudf.Index(ind, dtype="str") - sizes.append(self.index.memory_usage(deep=deep)) + sizes.append(self.index.memory_usage()) return Series(sizes, index=ind) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index de463269743..071344084c2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -474,7 +474,12 @@ def get_slice_bound(self, label, side, kind=None): pos = search_range(start, stop, label, step, side=side) return pos - def memory_usage(self, **kwargs): + def memory_usage(self, deep=False): + if deep: + warnings.warn( + "The deep parameter is ignored and is only included " + "for pandas compatibility." + ) return 0 def unique(self): @@ -1022,9 +1027,6 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask - def __sizeof__(self): - return self._values.__sizeof__() - def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) mr = 0 diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2dd70b336e9..961476309c7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -979,9 +979,6 @@ def set_mask(self, mask, null_count=None): {self.name: self._column.set_mask(mask)}, self._index ) - def __sizeof__(self): - return self._column.__sizeof__() + self._index.__sizeof__() - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. @@ -1020,9 +1017,14 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage(index=False) 24 """ - n = self._column._memory_usage(deep=deep) + if deep: + warnings.warn( + "The deep parameter is ignored and is only included " + "for pandas compatibility." + ) + n = self._column.memory_usage() if index: - n += self._index.memory_usage(deep=deep) + n += self._index.memory_usage() return n def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2f329766936..30edc0fb260 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3649,19 +3649,6 @@ def test_empty_dataframe_any(axis): assert_eq(got, expected, check_index_type=False) -@pytest.mark.parametrize("indexed", [False, True]) -def test_dataframe_sizeof(indexed): - rows = int(1e6) - index = list(i for i in range(rows)) if indexed else None - - gdf = cudf.DataFrame({"A": [8] * rows, "B": [32] * rows}, index=index) - - for c in gdf._data.columns: - assert gdf._index.__sizeof__() == gdf._index.__sizeof__() - cols_sizeof = sum(c.__sizeof__() for c in gdf._data.columns) - assert gdf.__sizeof__() == (gdf._index.__sizeof__() + cols_sizeof) - - @pytest.mark.parametrize("a", [[], ["123"]]) @pytest.mark.parametrize("b", ["123", ["123"]]) @pytest.mark.parametrize( @@ -5394,8 +5381,8 @@ def test_memory_usage_cat(): gdf = cudf.from_pandas(df) expected = ( - gdf.B._column.categories.__sizeof__() - + gdf.B._column.codes.__sizeof__() + gdf.B._column.categories.memory_usage() + + gdf.B._column.codes.memory_usage() ) # Check cat column @@ -5408,8 +5395,8 @@ def test_memory_usage_cat(): def test_memory_usage_list(): df = cudf.DataFrame({"A": [[0, 1, 2, 3], [4, 5, 6], [7, 8], [9]]}) expected = ( - df.A._column.offsets._memory_usage() - + df.A._column.elements._memory_usage() + df.A._column.offsets.memory_usage() + + df.A._column.elements.memory_usage() ) assert expected == df.A.memory_usage() diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 28e63ec41f1..8d504edd669 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -68,7 +68,7 @@ def test_pickle_dataframe_categorical(): check_serialization(df) -def test_sizeof_dataframe(): +def test_memory_usage_dataframe(): np.random.seed(0) df = DataFrame() nelem = 1000 @@ -76,7 +76,7 @@ def test_sizeof_dataframe(): df["vals"] = hvals = np.random.random(nelem) nbytes = hkeys.nbytes + hvals.nbytes - sizeof = sys.getsizeof(df) + sizeof = df.memory_usage().sum() assert sizeof >= nbytes serialized_nbytes = len(pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 02a782151db..f81a4743a4a 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -26,6 +26,7 @@ is_scalar, make_meta_obj, ) +from dask.sizeof import sizeof as sizeof_dispatch import cudf from cudf.api.types import is_string_dtype @@ -345,3 +346,13 @@ def group_split_cudf(df, c, k, ignore_index=False): ), ) ) + + +@sizeof_dispatch.register(cudf.DataFrame) +def sizeof_cudf_dataframe(df): + return int(df.memory_usage().sum()) + + +@sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +def sizeof_cudf_series_index(obj): + return obj.memory_usage()