From bf5b778c265b3bfa712f509be0ba268216bcf3d0 Mon Sep 17 00:00:00 2001
From: Christopher Harris
Date: Mon, 28 Oct 2024 23:51:03 -0500
Subject: [PATCH] Check `num_children() == 0` in `Column.from_column_view` (#17193)

This fixes a bug where `Column.from_column_view` is not verifying the
existence of a string column's offsets child column prior to accessing it,
resulting in a segmentation fault when passing a `column_view` from
`Column.view()` to `Column.from_column_view(...)`.

The issue can be reproduced with:

```
import cudf
from cudf.core.column.column import as_column

df = cudf.DataFrame({'a': cudf.Series([[]], dtype=cudf.core.dtypes.ListDtype('string'))})
s = df['a']
col = as_column(s)
col2 = cudf._lib.column.Column.back_and_forth(col)
print(col)
print(col2)
```

where `back_and_forth` is defined as:

```
@staticmethod
def back_and_forth(Column input_column):
    cdef column_view input_column_view = input_column.view()
    return Column.from_column_view(input_column_view, input_column)
```

I don't have the expertise to write the appropriate tests for this without
introducing the `back_and_forth` function as an API, which seems undesirable.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17193
---
 python/cudf/cudf/_lib/column.pyx | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 065655505b8..94dbdf5534d 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -688,15 +688,18 @@ cdef class Column:
         # special case for string column
         is_string_column = (cv.type().id() == libcudf_types.type_id.STRING)
         if is_string_column:
-            # get the size from offset child column (device to host copy)
-            offsets_column_index = 0
-            offset_child_column = cv.child(offsets_column_index)
-            if offset_child_column.size() == 0:
+            if cv.num_children() == 0:
                 base_nbytes = 0
             else:
-                chars_size = get_element(
-                    offset_child_column, offset_child_column.size()-1).value
-                base_nbytes = chars_size
+                # get the size from offset child column (device to host copy)
+                offsets_column_index = 0
+                offset_child_column = cv.child(offsets_column_index)
+                if offset_child_column.size() == 0:
+                    base_nbytes = 0
+                else:
+                    chars_size = get_element(
+                        offset_child_column, offset_child_column.size()-1).value
+                    base_nbytes = chars_size
 
         if data_ptr:
             if data_owner is None: