From 8adf0995f5e16c455e803c18dfd4a9be1ea4c575 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 08:46:18 -1000 Subject: [PATCH] Remove `build_struct|list_column` (#14786) IMO these do not provide much value compared to constructing with `ListColumn` or `StructColumn` cc https://github.com/rapidsai/cudf/pull/14778#discussion_r1457932822 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14786 --- python/cudf/cudf/core/column/__init__.py | 2 - python/cudf/cudf/core/column/column.py | 86 ------------------------ python/cudf/cudf/core/column/lists.py | 11 +-- python/cudf/cudf/core/column/struct.py | 9 +-- python/cudf/cudf/core/dataframe.py | 10 +-- python/cudf/cudf/core/groupby/groupby.py | 10 ++- 6 files changed, 24 insertions(+), 104 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 3dddcae85dc..a1c86b617b0 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -11,8 +11,6 @@ as_column, build_categorical_column, build_column, - build_list_column, - build_struct_column, column_empty, column_empty_like, column_empty_like_same_mask, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 191c55a8a68..cecdaf70750 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1690,92 +1690,6 @@ def build_categorical_column( return cast("cudf.core.column.CategoricalColumn", result) -def build_list_column( - indices: ColumnBase, - elements: ColumnBase, - mask: Optional[Buffer] = None, - size: Optional[int] = None, - offset: int = 0, - null_count: Optional[int] = None, -) -> "cudf.core.column.ListColumn": - """ - Build 
a ListColumn - - Parameters - ---------- - indices : ColumnBase - Column of list indices - elements : ColumnBase - Column of list elements - mask: Buffer - Null mask - size: int, optional - offset: int, optional - """ - dtype = ListDtype(element_type=elements.dtype) - if size is None: - if indices.size == 0: - size = 0 - else: - # one less because the last element of offsets is the number of - # bytes in the data buffer - size = indices.size - 1 - size = size - offset - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(indices, elements), - ) - - return cast("cudf.core.column.ListColumn", result) - - -def build_struct_column( - names: Sequence[str], - children: Tuple[ColumnBase, ...], - dtype: Optional[Dtype] = None, - mask: Optional[Buffer] = None, - size: Optional[int] = None, - offset: int = 0, - null_count: Optional[int] = None, -) -> "cudf.core.column.StructColumn": - """ - Build a StructColumn - - Parameters - ---------- - names : sequence of strings - Field names to map to children dtypes, must be strings. 
- children : tuple - - mask: Buffer - Null mask - size: int, optional - offset: int, optional - """ - if dtype is None: - dtype = StructDtype( - fields={name: col.dtype for name, col in zip(names, children)} - ) - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=children, - ) - - return cast("cudf.core.column.StructColumn", result) - - def _make_copy_replacing_NaT_with_null(column): """Return a copy with NaT values replaced with nulls.""" if np.issubdtype(column.dtype, np.timedelta64): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c28489a2f98..b2205af34e8 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -184,15 +184,16 @@ def _with_type_metadata( self: "cudf.core.column.ListColumn", dtype: Dtype ) -> "cudf.core.column.ListColumn": if isinstance(dtype, ListDtype): - return column.build_list_column( - indices=self.base_children[0], - elements=self.base_children[1]._with_type_metadata( - dtype.element_type - ), + elements = self.base_children[1]._with_type_metadata( + dtype.element_type + ) + return ListColumn( + dtype=dtype, mask=self.base_mask, size=self.size, offset=self.offset, null_count=self.null_count, + children=(self.base_children[0], elements), ) return self diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6cfa8db0d96..69e9a50956b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from __future__ import annotations from functools import cached_property @@ -9,7 +9,7 @@ import cudf from cudf._typing import Dtype -from cudf.core.column import ColumnBase, build_struct_column +from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA @@ -134,8 +134,9 @@ def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: if isinstance(dtype, IntervalDtype): return IntervalColumn.from_struct_column(self, closed=dtype.closed) elif isinstance(dtype, StructDtype): - return build_struct_column( - names=dtype.fields.keys(), + return StructColumn( + data=None, + dtype=dtype, children=tuple( self.base_children[i]._with_type_metadata(dtype.fields[f]) for i, f in enumerate(dtype.fields.keys()) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 89abd7be0ba..5b300f5e4db 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -61,6 +61,7 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, + StructColumn, as_column, build_categorical_column, build_column, @@ -7127,12 +7128,13 @@ def to_struct(self, name=None): "requires field name to be string. Non-string column names " "will be casted to string as the field name." 
) - field_names = [str(name) for name in self._data.names] - - col = cudf.core.column.build_struct_column( - names=field_names, + fields = {str(name): col.dtype for name, col in self._data.items()} + col = StructColumn( + data=None, + dtype=cudf.StructDtype(fields=fields), children=tuple(col.copy(deep=True) for col in self._data.columns), size=len(self), + offset=0, ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor({name: col}), diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a236a9b6abf..9612349a607 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -25,7 +25,7 @@ from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable @@ -2036,10 +2036,14 @@ def _cov_or_corr(self, func, method_name): ) x, y = str(x), str(y) - column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( - names=(x, y), + column_pair_structs[(x, y)] = cudf.core.column.StructColumn( + data=None, + dtype=StructDtype( + fields={x: self.obj._data[x].dtype, y: self.obj._data[y].dtype} + ), children=(self.obj._data[x], self.obj._data[y]), size=len(self.obj), + offset=0, ) column_pair_groupby = cudf.DataFrame._from_data(