Skip to content

Commit

Permalink
Enable caching for memory_usage calculation in Column (NVIDIA#10345)
Browse files Browse the repository at this point in the history
We previously cached the output of `Column.memory_usage` in `Column._cached_sizeof`, but that caching was probably lost inadvertently during the recent refactors. This PR re-enables caching of `memory_usage`.

With this change, consecutive calls to `Column.memory_usage` should be effectively a no-op (the cached value is returned) and about **60% faster**:
```python
In [1]: import cudf

In [2]: s = cudf.Series(["abc ", " d e", None, "10 11 234355"] * 10000000)

In [3]: s
Out[3]: 
0                   abc 
1                    d e
2                   <NA>
3           10 11 234355
4                   abc 
                ...     
39999995    10 11 234355
39999996            abc 
39999997             d e
39999998            <NA>
39999999    10 11 234355
Length: 40000000, dtype: object



# branch-22.04
In [3]: %timeit s.memory_usage()
2.86 µs ± 53.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

# THIS PR

In [3]: %timeit s.memory_usage()
1.77 µs ± 10.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai/cudf#10345
  • Loading branch information
galipremsagar authored Feb 23, 2022
1 parent c163886 commit 0ae9dc6
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 11 deletions.
7 changes: 5 additions & 2 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ cdef class Column:
):

self._size = size
self._cached_sizeof = None
self._distinct_count = {}
self._dtype = dtype
self._offset = offset
Expand Down Expand Up @@ -204,7 +203,11 @@ cdef class Column:

def _clear_cache(self):
self._distinct_count = {}
self._cached_sizeof = None
try:
del self.memory_usage
except AttributeError:
# `self.memory_usage` was never called before, So ignore.
pass
self._null_count = None

def set_mask(self, value):
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pickle
from collections.abc import MutableSequence
from functools import cached_property
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -1335,8 +1336,9 @@ def copy(self, deep: bool = True) -> CategoricalColumn:
size=self.size,
)

@cached_property
def memory_usage(self) -> int:
return self.categories.memory_usage() + self.codes.memory_usage()
return self.categories.memory_usage + self.codes.memory_usage

def _mimic_inplace(
self, other_col: ColumnBase, inplace: bool = False
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import builtins
import pickle
import warnings
from functools import cached_property
from types import SimpleNamespace
from typing import (
Any,
Expand Down Expand Up @@ -297,6 +298,7 @@ def _get_mask_as_column(self) -> ColumnBase:
self.base_mask, self.offset, self.offset + len(self)
)

@cached_property
def memory_usage(self) -> int:
n = 0
if self.data is not None:
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import pickle
from functools import cached_property
from typing import List, Sequence

import numpy as np
Expand Down Expand Up @@ -42,6 +43,7 @@ def __init__(
children=children,
)

@cached_property
def memory_usage(self):
n = 0
if self.nullable:
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pickle
import re
import warnings
from functools import cached_property
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -5024,7 +5025,6 @@ class StringColumn(column.ColumnBase):

_start_offset: Optional[int]
_end_offset: Optional[int]
_cached_sizeof: Optional[int]

def __init__(
self,
Expand Down Expand Up @@ -5102,6 +5102,7 @@ def end_offset(self) -> int:

return self._end_offset

@cached_property
def memory_usage(self) -> int:
n = 0
if len(self.base_children) == 2:
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def memory_usage(self, deep=False):
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
return {name: col.memory_usage() for name, col in self._data.items()}
return {name: col.memory_usage for name, col in self._data.items()}

def __len__(self):
return self._num_rows
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1415,7 +1415,7 @@ def memory_usage(self, deep=False):
usage += level.memory_usage(deep=deep)
if self.codes:
for col in self.codes._data.columns:
usage += col.memory_usage()
usage += col.memory_usage
return usage

def difference(self, other, sort=None):
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5195,8 +5195,8 @@ def test_memory_usage_cat():
gdf = cudf.from_pandas(df)

expected = (
gdf.B._column.categories.memory_usage()
+ gdf.B._column.codes.memory_usage()
gdf.B._column.categories.memory_usage
+ gdf.B._column.codes.memory_usage
)

# Check cat column
Expand All @@ -5209,8 +5209,7 @@ def test_memory_usage_cat():
def test_memory_usage_list():
df = cudf.DataFrame({"A": [[0, 1, 2, 3], [4, 5, 6], [7, 8], [9]]})
expected = (
df.A._column.offsets.memory_usage()
+ df.A._column.elements.memory_usage()
df.A._column.offsets.memory_usage + df.A._column.elements.memory_usage
)
assert expected == df.A.memory_usage()

Expand Down

0 comments on commit 0ae9dc6

Please sign in to comment.