Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Refactor memory_usage to improve performance #10537

Merged
merged 9 commits into from
Mar 31, 2022
10 changes: 8 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1339,8 +1339,14 @@ def _slice(self: T, arg: slice) -> T:

@_cudf_nvtx_annotate
def memory_usage(self, index=True, deep=False):
return Series(
{str(k): v for k, v in super().memory_usage(index, deep).items()}
mem_usage = [col.memory_usage for col in self._data.columns]
names = [str(name) for name in self._data.names]
if index:
mem_usage.append(self._index.memory_usage())
names.append("Index")
return Series._from_data(
data={None: as_column(mem_usage)},
index=as_index(names),
)
shwina marked this conversation as resolved.
Show resolved Hide resolved

@_cudf_nvtx_annotate
Expand Down
7 changes: 1 addition & 6 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,7 @@ def memory_usage(self, deep=False):
-------
The total bytes used.
"""
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
return {name: col.memory_usage for name, col in self._data.items()}
raise NotImplementedError
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this not implemented now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have moved all their implementations to their respective Series/DataFrame/MultiIndex/Index .memory_usage methods directly. Kept these parent class methods for the sake of docstrings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok so a user is unlikely to run into this error since they would be using one of these other public facing objects. Is that right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, that's correct.


def __len__(self):
return self._num_rows
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ def _concat(cls, objs):

@_cudf_nvtx_annotate
def memory_usage(self, deep=False):
return sum(super().memory_usage(deep=deep).values())
return self._column.memory_usage

@_cudf_nvtx_annotate
def equals(self, other, **kwargs):
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,10 +704,7 @@ def memory_usage(self, index=True, deep=False):
>>> s.memory_usage(index=False)
24
"""
usage = super().memory_usage(deep=deep)
if index:
usage["Index"] = self.index.memory_usage()
return usage
raise NotImplementedError
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

def hash_values(self, method="murmur3"):
"""Compute the hash of values in this column.
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1474,7 +1474,7 @@ def _clean_nulls_from_index(self):

@_cudf_nvtx_annotate
def memory_usage(self, deep=False):
usage = sum(super().memory_usage(deep=deep).values())
usage = sum(col.memory_usage for col in self._data.columns)
if self.levels:
for level in self.levels:
usage += level.memory_usage(deep=deep)
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,9 @@ def to_frame(self, name=None):

@_cudf_nvtx_annotate
def memory_usage(self, index=True, deep=False):
return sum(super().memory_usage(index, deep).values())
return self._column.memory_usage + (
self._index.memory_usage() if index else 0
)

@_cudf_nvtx_annotate
def __array_function__(self, func, types, args, kwargs):
Expand Down
5 changes: 4 additions & 1 deletion python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,10 @@ def group_split_cudf(df, c, k, ignore_index=False):
@sizeof_dispatch.register(cudf.DataFrame)
@_dask_cudf_nvtx_annotate
def sizeof_cudf_dataframe(df):
return int(df.memory_usage().sum())
return int(
sum(col.memory_usage for col in df._data.columns)
+ df._index.memory_usage()
)


@sizeof_dispatch.register((cudf.Series, cudf.BaseIndex))
Expand Down