From f117d0395173de8abf684c08f2a16432c302982f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 31 Oct 2022 16:27:01 -0700 Subject: [PATCH 1/3] add StructColumn.memory_usage --- python/cudf/cudf/core/column/struct.py | 13 +++++++++++++ python/cudf/cudf/core/dtypes.py | 7 +++++++ python/cudf/cudf/tests/test_list.py | 2 ++ python/cudf/cudf/tests/test_struct.py | 15 +++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 67ff3e48dbd..69d70cf427f 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,6 +1,8 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations +from functools import cached_property + import pandas as pd import pyarrow as pa @@ -65,6 +67,17 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": pd_series.index = index return pd_series + @cached_property + def memory_usage(self): + n = 0 + if self.nullable: + n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) + + for child in self.children: + n += child.memory_usage + + return n + def element_indexing(self, index: int): result = super().element_indexing(index) return { diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 84f528549e9..201dc253a98 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -627,6 +627,13 @@ def deserialize(cls, header: dict, frames: list): fields[k] = pickle.loads(dtype) return cls(fields) + @property + def itemsize(self): + return sum( + cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type).itemsize + for field in self._typ + ) + decimal_dtype_template = textwrap.dedent( """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 8ea11382419..4c2a14fc45c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -864,6 +864,8 @@ def test_memory_usage(): assert s1.memory_usage() == 44 s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]]) assert s2.memory_usage() == 68 + s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]]) + assert s3.memory_usage() == 40 @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 4c70d20c488..95e02702a5a 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -371,3 +371,18 @@ def test_nested_struct_extract_host_scalars(data, idx, expected): series = cudf.Series(data) assert _nested_na_replace(series[idx]) == _nested_na_replace(expected) + + +def test_struct_memory_usage(): + s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]) + df = s.struct.explode() + + assert_eq(s.memory_usage(), df.memory_usage().sum()) + + +def test_struct_with_null_memory_usage(): + s = cudf.Series( + [{"a": 1, "b": 10}, {"a": 2, "b": 20}, None, None, {"a": 3, "b": 30}] + ) + + assert s.memory_usage() == 272 From 1a07b9204c4cccdd4951b02bc0a3740546d4d6a5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Nov 2022 05:34:08 -0700 Subject: [PATCH 2/3] address reviews --- python/cudf/cudf/core/dtypes.py | 3 ++- python/cudf/cudf/tests/test_struct.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 201dc253a98..a56de0cd451 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -4,6 +4,7 @@ import operator import pickle import textwrap +from functools import cached_property from typing import Any, Callable, Dict, List, Tuple, Type, Union import numpy as np @@ -627,7 +628,7 @@ def deserialize(cls, header: dict, frames: list): fields[k] = pickle.loads(dtype) return cls(fields) - @property + @cached_property def itemsize(self): return sum( cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type).itemsize diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 95e02702a5a..64dcec60d73 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -381,8 +381,13 @@ def test_struct_memory_usage(): def test_struct_with_null_memory_usage(): - s = cudf.Series( - [{"a": 1, "b": 10}, {"a": 2, "b": 20}, None, None, {"a": 3, "b": 30}] + df = cudf.DataFrame( + { + "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"), + "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"), + } ) + s = df.to_struct() + s[2:4] = None assert s.memory_usage() == 272 From 668c842bfcf3f935398fc34ee07e25a952c9c756 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Nov 2022 05:38:23 -0700 Subject: [PATCH 3/3] add before and after null mask --- python/cudf/cudf/tests/test_struct.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 64dcec60d73..eaee1efcbc8 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -388,6 +388,7 @@ def test_struct_with_null_memory_usage(): } ) s = df.to_struct() - s[2:4] = None + assert s.memory_usage() == 80 + s[2:4] = None assert s.memory_usage() == 272