From c1c62f1c02cf3929fb7536d67d14a24a9e2950ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 4 Jul 2024 04:31:06 -1000 Subject: [PATCH] Fix `memory_usage` when calculating nested list column (#16193) The offset column of a nested empty list column may be empty as discussed in https://github.com/rapidsai/cudf/issues/16164. `ListColumn.memory_usage` assumed that this offset column was non-empty and indexed into it unconditionally; it now stops descending into the nested children when the offset column is empty. Unblocks https://github.com/rapidsai/cuspatial/pull/1400 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16193 --- python/cudf/cudf/core/column/lists.py | 11 ++++++++--- python/cudf/cudf/tests/test_list.py | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c548db67344..1992d471947 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -73,10 +73,15 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) n += child0_size + current_offset_col = current_base_child.base_children[0] + if not len(current_offset_col): + # See https://github.com/rapidsai/cudf/issues/16164 why + # offset column can be uninitialized + break + current_offset = current_offset_col.element_indexing( + current_offset + ) current_base_child = current_base_child.base_children[1] n += ( diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f76143cb381..ec9d7995b05 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -12,6 +12,7 @@ from cudf import NA from cudf._lib.copying 
import get_element from cudf.api.types import is_scalar +from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @@ -926,3 +927,29 @@ def test_list_iterate_error(): def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) + nested_col = col.children[1] + empty_inner = type(nested_col)( + size=nested_col.size, + dtype=nested_col.dtype, + mask=nested_col.mask, + offset=nested_col.offset, + null_count=nested_col.null_count, + children=( + column_empty(0, nested_col.children[0].dtype), + nested_col.children[1], + ), + ) + col_empty_offset = type(col)( + size=col.size, + dtype=col.dtype, + mask=col.mask, + offset=col.offset, + null_count=col.null_count, + children=(column_empty(0, col.children[0].dtype), empty_inner), + ) + ser = cudf.Series._from_data({None: col_empty_offset}) + assert ser.memory_usage() == 8