From 58eea85599008b54ec809eda69ea5e573211bed1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:19:53 -0800 Subject: [PATCH 1/2] Remove build_struct_column --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 41 ------------------------ python/cudf/cudf/core/column/struct.py | 9 +++--- python/cudf/cudf/core/dataframe.py | 10 +++--- python/cudf/cudf/core/groupby/groupby.py | 10 ++++-- 5 files changed, 18 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 3dddcae85dc..35a1693b68f 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -12,7 +12,6 @@ build_categorical_column, build_column, build_list_column, - build_struct_column, column_empty, column_empty_like, column_empty_like_same_mask, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 705862c502a..eb5c674cfe0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1788,47 +1788,6 @@ def build_list_column( return cast("cudf.core.column.ListColumn", result) -def build_struct_column( - names: Sequence[str], - children: Tuple[ColumnBase, ...], - dtype: Optional[Dtype] = None, - mask: Optional[Buffer] = None, - size: Optional[int] = None, - offset: int = 0, - null_count: Optional[int] = None, -) -> "cudf.core.column.StructColumn": - """ - Build a StructColumn - - Parameters - ---------- - names : sequence of strings - Field names to map to children dtypes, must be strings. 
- children : tuple - - mask: Buffer - Null mask - size: int, optional - offset: int, optional - """ - if dtype is None: - dtype = StructDtype( - fields={name: col.dtype for name, col in zip(names, children)} - ) - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=children, - ) - - return cast("cudf.core.column.StructColumn", result) - - def _make_copy_replacing_NaT_with_null(column): """Return a copy with NaT values replaced with nulls.""" if np.issubdtype(column.dtype, np.timedelta64): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6cfa8db0d96..69e9a50956b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -9,7 +9,7 @@ import cudf from cudf._typing import Dtype -from cudf.core.column import ColumnBase, build_struct_column +from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA @@ -134,8 +134,9 @@ def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: if isinstance(dtype, IntervalDtype): return IntervalColumn.from_struct_column(self, closed=dtype.closed) elif isinstance(dtype, StructDtype): - return build_struct_column( - names=dtype.fields.keys(), + return StructColumn( + data=None, + dtype=dtype, children=tuple( self.base_children[i]._with_type_metadata(dtype.fields[f]) for i, f in enumerate(dtype.fields.keys()) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9cf180ff44..1060d902e39 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -62,6 +62,7 @@ from cudf.core.column import ( 
CategoricalColumn, ColumnBase, + StructColumn, as_column, build_categorical_column, build_column, @@ -6999,12 +7000,13 @@ def to_struct(self, name=None): "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - field_names = [str(name) for name in self._data.names] - - col = cudf.core.column.build_struct_column( - names=field_names, + fields = {str(name): col.dtype for name, col in self._data.items()} + col = StructColumn( + data=None, + dtype=cudf.StructDtype(fields=fields), children=tuple(col.copy(deep=True) for col in self._data.columns), size=len(self), + offset=0, ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor({name: col}), diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6c83bcd9efb..8afbcc48c75 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -23,7 +23,7 @@ from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable @@ -1968,10 +1968,14 @@ def _cov_or_corr(self, func, method_name): ) x, y = str(x), str(y) - column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( - names=(x, y), + column_pair_structs[(x, y)] = cudf.core.column.StructColumn( + data=None, + dtype=StructDtype( + fields={x: self.obj._data[x].dtype, y: self.obj._data[y].dtype} + ), children=(self.obj._data[x], self.obj._data[y]), size=len(self.obj), + offset=0, ) column_pair_groupby = cudf.DataFrame._from_data( From 6e24fe65c4b5dfa3918bf8eb8d743424feca88e3 Mon Sep 17 00:00:00 2001 From: Matthew
Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:31:17 -0800 Subject: [PATCH 2/2] Remove build_list_column --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 45 ------------------------ python/cudf/cudf/core/column/lists.py | 11 +++--- 3 files changed, 6 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 35a1693b68f..a1c86b617b0 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -11,7 +11,6 @@ as_column, build_categorical_column, build_column, - build_list_column, column_empty, column_empty_like, column_empty_like_same_mask, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index eb5c674cfe0..6ae07328156 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1743,51 +1743,6 @@ def build_interval_column( ) -def build_list_column( - indices: ColumnBase, - elements: ColumnBase, - mask: Optional[Buffer] = None, - size: Optional[int] = None, - offset: int = 0, - null_count: Optional[int] = None, -) -> "cudf.core.column.ListColumn": - """ - Build a ListColumn - - Parameters - ---------- - indices : ColumnBase - Column of list indices - elements : ColumnBase - Column of list elements - mask: Buffer - Null mask - size: int, optional - offset: int, optional - """ - dtype = ListDtype(element_type=elements.dtype) - if size is None: - if indices.size == 0: - size = 0 - else: - # one less because the last element of offsets is the number of - # bytes in the data buffer - size = indices.size - 1 - size = size - offset - - result = build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(indices, elements), - ) - - return cast("cudf.core.column.ListColumn", result) - - def _make_copy_replacing_NaT_with_null(column): 
"""Return a copy with NaT values replaced with nulls.""" if np.issubdtype(column.dtype, np.timedelta64): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0cccec6f28a..1d9b54273e4 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -184,15 +184,16 @@ def _with_type_metadata( self: "cudf.core.column.ListColumn", dtype: Dtype ) -> "cudf.core.column.ListColumn": if isinstance(dtype, ListDtype): - return column.build_list_column( - indices=self.base_children[0], - elements=self.base_children[1]._with_type_metadata( - dtype.element_type - ), + elements = self.base_children[1]._with_type_metadata( + dtype.element_type + ) + return ListColumn( + dtype=dtype, mask=self.base_mask, size=self.size, offset=self.offset, null_count=self.null_count, + children=(self.base_children[0], elements), ) return self