Skip to content

Commit

Permalink
Add memory_usage & itemsize implementation for Struct column & dty…
Browse files Browse the repository at this point in the history
…pe (#12033)

Fixes: #11893 

- [x] This PR implements `StructColumn.memory_usage` and `StructDtype.itemsize`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #12033
  • Loading branch information
galipremsagar authored Nov 3, 2022
1 parent baa645d commit b156c25
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 0 deletions.
13 changes: 13 additions & 0 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from __future__ import annotations

from functools import cached_property

import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -65,6 +67,17 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
pd_series.index = index
return pd_series

@cached_property
def memory_usage(self):
    """Return the memory usage of this struct column.

    The result is the sum of the ``memory_usage`` of every child column,
    plus, when the column is nullable, the value returned by
    ``bitmask_allocation_size_bytes`` for ``self.size``.
    """
    # Only nullable columns contribute a bitmask term.
    mask_bytes = (
        cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
        if self.nullable
        else 0
    )
    return mask_bytes + sum(child.memory_usage for child in self.children)

def element_indexing(self, index: int):
result = super().element_indexing(index)
return {
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import operator
import pickle
import textwrap
from functools import cached_property
from typing import Any, Callable, Dict, List, Tuple, Type, Union

import numpy as np
Expand Down Expand Up @@ -627,6 +628,13 @@ def deserialize(cls, header: dict, frames: list):
fields[k] = pickle.loads(dtype)
return cls(fields)

@cached_property
def itemsize(self):
    """Return the combined ``itemsize`` of all fields of this struct dtype.

    Each field in ``self._typ`` has its ``type`` converted with
    ``cudf_dtype_from_pa_type``; the ``itemsize`` values of the resulting
    dtypes are added together.
    """
    total = 0
    for field in self._typ:
        field_dtype = cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type)
        total += field_dtype.itemsize
    return total


decimal_dtype_template = textwrap.dedent(
"""
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,8 @@ def test_memory_usage():
assert s1.memory_usage() == 44
s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]])
assert s2.memory_usage() == 68
s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]])
assert s3.memory_usage() == 40


@pytest.mark.parametrize(
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,24 @@ def test_nested_struct_extract_host_scalars(data, idx, expected):
series = cudf.Series(data)

assert _nested_na_replace(series[idx]) == _nested_na_replace(expected)


def test_struct_memory_usage():
    """A struct Series reports the same usage as its exploded columns."""
    records = [{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]
    struct_series = cudf.Series(records)
    exploded = struct_series.struct.explode()

    actual = struct_series.memory_usage()
    expected = exploded.memory_usage().sum()
    assert_eq(actual, expected)


def test_struct_with_null_memory_usage():
    """Check struct memory usage before and after nulling out rows."""
    columns = {
        "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"),
        "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"),
    }
    struct_series = cudf.DataFrame(columns).to_struct()

    # Two int64 children with five rows each: 2 * 5 * 8 bytes.
    assert struct_series.memory_usage() == 80

    # Assigning None to rows 2 and 3 raises the reported usage
    # (presumably through added validity masks; confirm against the column
    # implementations).
    struct_series[2:4] = None
    assert struct_series.memory_usage() == 272

0 comments on commit b156c25

Please sign in to comment.