Skip to content

Commit

Permalink
Fix memory_usage when calculating nested list column (#16193)
Browse files Browse the repository at this point in the history
The offset column of a nested empty list column may be empty, as discussed in #16164. `ListColumn.memory_usage` assumed that this column was non-empty.

Unblocks rapidsai/cuspatial#1400

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #16193
  • Loading branch information
mroeschke authored Jul 4, 2024
1 parent 5f57bc9 commit c1c62f1
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
11 changes: 8 additions & 3 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,15 @@ def memory_usage(self):
child0_size = (
current_base_child.size + 1 - current_offset
) * current_base_child.base_children[0].dtype.itemsize
current_offset = current_base_child.base_children[
0
].element_indexing(current_offset)
n += child0_size
current_offset_col = current_base_child.base_children[0]
if not len(current_offset_col):
# See https://github.com/rapidsai/cudf/issues/16164 for why the
# offset column can be uninitialized
break
current_offset = current_offset_col.element_indexing(
current_offset
)
current_base_child = current_base_child.base_children[1]

n += (
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from cudf import NA
from cudf._lib.copying import get_element
from cudf.api.types import is_scalar
from cudf.core.column.column import column_empty
from cudf.testing import assert_eq
from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES

Expand Down Expand Up @@ -926,3 +927,29 @@ def test_list_iterate_error():
def test_list_struct_list_memory_usage():
    """Check the summed memory usage of a list-of-struct-of-list column.

    The frame holds a single row whose value is a list containing one
    struct, and that struct's ``"b"`` field is itself a list.
    """
    frame = cudf.DataFrame({"a": [[{"b": [1]}]]})
    total_bytes = frame.memory_usage().sum()
    assert total_bytes == 16


def test_empty_nested_list_uninitialized_offsets_memory_usage():
    """Check memory_usage on a nested list column with empty offsets.

    Both the outer list column and its inner list child are rebuilt so
    that their offsets child (``children[0]``) is a zero-length column,
    then ``Series.memory_usage`` is asserted to return 8.
    """

    def _with_empty_offsets(list_col, values_child):
        # Rebuild ``list_col`` with the same metadata, but with a
        # zero-length offsets child of the original offsets dtype.
        return type(list_col)(
            size=list_col.size,
            dtype=list_col.dtype,
            mask=list_col.mask,
            offset=list_col.offset,
            null_count=list_col.null_count,
            children=(
                column_empty(0, list_col.children[0].dtype),
                values_child,
            ),
        )

    outer = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
    inner = outer.children[1]

    # Replace offsets from the inside out so the outer column wraps the
    # already-rebuilt inner column.
    inner_no_offsets = _with_empty_offsets(inner, inner.children[1])
    outer_no_offsets = _with_empty_offsets(outer, inner_no_offsets)

    series = cudf.Series._from_data({None: outer_no_offsets})
    assert series.memory_usage() == 8

0 comments on commit c1c62f1

Please sign in to comment.