Skip to content

Commit

Permalink
Fix memory usage calculation (#6596)
Browse files Browse the repository at this point in the history
Fixes: #6590
  • Loading branch information
galipremsagar authored Oct 27, 2020
1 parent 4b4d962 commit e620a73
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
- PR #6543 Handle `np.nan` values in `isna`/`isnull`/`notna`/`notnull`
- PR #6549 Fix memory_usage calls for list columns
- PR #6575 Fix JNI RMM initialize with no pool allocator limit
- PR #6596 Fix memory usage calculation
- PR #6595 Fix JNI build, broken by to_arrow() signature change


Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ cdef class Column:
):

self._size = size
self._cached_sizeof = None
self._dtype = dtype
self._offset = offset
self._null_count = null_count
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,9 @@ def any(self):
return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_))

def __sizeof__(self):
    """Return the size this column reports for its own buffers.

    The result is the size of the (sliced) ``data`` buffer plus, when the
    column is nullable, the null-mask allocation needed for ``self.size``
    rows as reported by ``bitmask_allocation_size_bytes``.
    """
    # NOTE(review): assumes ``data`` may be None for columns that carry no
    # data buffer of their own -- confirm against the subclasses.
    data = self.data
    n = data.size if data is not None else 0
    if self.nullable:
        # Account for the mask covering only this column's rows rather
        # than the whole underlying base mask.
        n += bitmask_allocation_size_bytes(self.size)
    return n

@classmethod
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pyarrow as pa

import cudf
from cudf.core.column import ColumnBase
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.utils.dtypes import is_list_dtype
Expand All @@ -21,6 +22,42 @@ def __init__(
children=children,
)

def __sizeof__(self):
    """Return the estimated size of this list column, caching the result.

    The estimate adds up:

    * the null-mask allocation for ``self.size`` rows, if nullable;
    * ``self.size + 1`` entries of the first child (presumably the
      offsets child) at that child's item size;
    * for every nested ``ListColumn`` level, that level's first child
      counted from the current offset onward;
    * the innermost (non-list) child counted from the current offset
      onward, plus its null-mask allocation if it is nullable.

    The value is computed once and stored in ``self._cached_sizeof``.
    """
    if self._cached_sizeof is None:
        n = 0
        if self.nullable:
            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(
                self.size
            )

        # Outermost level: one entry per row plus one.
        n += (self.size + 1) * self.base_children[0].dtype.itemsize

        current_base_child = self.base_children[1]
        current_offset = self.offset
        # NOTE(review): the first nested level is indexed with this
        # column's row offset rather than a value read from this column's
        # own first child -- kept as in the original, confirm intended.
        while isinstance(current_base_child, ListColumn):
            level_offsets = current_base_child.base_children[0]
            n += (
                current_base_child.size + 1 - current_offset
            ) * level_offsets.dtype.itemsize
            # Coerce to a plain int so no array/scalar type leaks into the
            # running total (mirrors the int(...) used by the string
            # column's offset lookups).
            current_offset = int(level_offsets[current_offset])
            current_base_child = current_base_child.base_children[1]

        # Innermost child: only the elements from the current offset on.
        n += (
            current_base_child.size - current_offset
        ) * current_base_child.dtype.itemsize

        if current_base_child.nullable:
            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(
                current_base_child.size
            )
        self._cached_sizeof = n

    return self._cached_sizeof

@property
def base_size(self):
return len(self.base_children[0]) - 1
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4506,6 +4506,58 @@ def __init__(
children=children,
)

self._start_offset = None
self._end_offset = None

@property
def start_offset(self):
    """Value of the first child at index ``self.offset``, cached.

    Falls back to ``0`` when the column does not have exactly two base
    children or when ``self.offset`` is not a valid index into the first
    child (presumably the offsets child). The result is stored in
    ``self._start_offset`` and reused on later accesses.
    """
    if self._start_offset is not None:
        return self._start_offset

    first = 0
    if len(self.base_children) == 2:
        offsets_child = self.base_children[0]
        if self.offset < offsets_child.size:
            first = int(offsets_child[self.offset])

    self._start_offset = first
    return first

@property
def end_offset(self):
    """Value of the first child at index ``offset + size``, cached.

    Falls back to ``0`` when the column does not have exactly two base
    children or when ``self.offset + self.size`` is not a valid index into
    the first child (presumably the offsets child). The result is stored
    in ``self._end_offset`` and reused on later accesses.
    """
    if self._end_offset is not None:
        return self._end_offset

    last = 0
    if len(self.base_children) == 2:
        stop = self.offset + self.size
        offsets_child = self.base_children[0]
        if stop < offsets_child.size:
            last = int(offsets_child[stop])

    self._end_offset = last
    return last

def __sizeof__(self):
    """Return the size reported for this string column, caching it.

    When the column has exactly two base children, the total is
    ``self.size + 1`` entries of the first child plus
    ``end_offset - start_offset`` entries of the second child, each at the
    respective child's item size. A nullable column additionally counts
    the null-mask allocation for ``self.size`` rows. The result is stored
    in ``self._cached_sizeof``.
    """
    if self._cached_sizeof is not None:
        return self._cached_sizeof

    total = 0
    if len(self.base_children) == 2:
        first_child_bytes = (
            self.size + 1
        ) * self.base_children[0].dtype.itemsize
        # Only the span between the slice's start and end offsets of the
        # second child is counted, not the whole child.
        span = self.end_offset - self.start_offset
        second_child_bytes = span * self.base_children[1].dtype.itemsize
        total += first_child_bytes + second_child_bytes

    # NOTE(review): placed at the same level as the children check;
    # the original nesting is assumed -- confirm.
    if self.nullable:
        total += cudf._lib.null_mask.bitmask_allocation_size_bytes(
            self.size
        )

    self._cached_sizeof = total
    return self._cached_sizeof

@property
def base_size(self):
if len(self.base_children) == 0:
Expand Down
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,3 +697,20 @@ def test_series_error_equality(sr1, sr2, op):
gsr2 = cudf.from_pandas(sr2)

assert_exceptions_equal(op, op, ([sr1, sr2],), ([gsr1, gsr2],))


def test_series_memory_usage():
    """Check ``Series.memory_usage`` on whole and sliced numeric/string data."""
    # Four int64 values: 4 * 8 bytes.
    int_series = cudf.Series([1, 2, 3, 4], dtype="int64")
    assert int_series.memory_usage() == 32

    # A slice reports only the rows it covers: 2 * 8 bytes.
    int_tail = int_series[2:]
    assert int_tail.memory_usage() == 16

    # Introducing a null makes the column nullable; the extra 64 bytes are
    # presumably the null-mask allocation.
    int_tail[3] = None
    assert int_tail.memory_usage() == 80

    # 24 characters in total; the remaining 20 bytes are presumably the
    # five offsets entries.
    str_series = cudf.Series(["hello world", "rapids ai", "abc", "z"])
    assert str_series.memory_usage() == 44

    last_row = str_series[3:]
    assert last_row.memory_usage() == 9  # z
    first_row = str_series[:1]
    assert first_row.memory_usage() == 19  # hello world
1 change: 0 additions & 1 deletion python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,6 @@ def test_drop(gdf, gddf):
@pytest.mark.parametrize("deep", [True, False])
@pytest.mark.parametrize("index", [True, False])
def test_memory_usage(gdf, gddf, index, deep):
gddf = gddf.map_partitions(lambda x: x.copy(deep=True))

dd.assert_eq(
gdf.memory_usage(deep=deep, index=index),
Expand Down

0 comments on commit e620a73

Please sign in to comment.