Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove build_struct|list_column #14786

Merged
merged 9 commits into from
Feb 23, 2024
2 changes: 0 additions & 2 deletions python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
as_column,
build_categorical_column,
build_column,
build_list_column,
build_struct_column,
column_empty,
column_empty_like,
column_empty_like_same_mask,
Expand Down
86 changes: 0 additions & 86 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,92 +1690,6 @@ def build_categorical_column(
return cast("cudf.core.column.CategoricalColumn", result)


def build_list_column(
    indices: ColumnBase,
    elements: ColumnBase,
    mask: Optional[Buffer] = None,
    size: Optional[int] = None,
    offset: int = 0,
    null_count: Optional[int] = None,
) -> "cudf.core.column.ListColumn":
    """
    Assemble a ListColumn from an offsets column and an elements column.

    Parameters
    ----------
    indices : ColumnBase
        Column of list indices (offsets); becomes the first child.
    elements : ColumnBase
        Column of list elements; becomes the second child. Its dtype is
        used as the element type of the resulting ``ListDtype``.
    mask : Buffer, optional
        Null mask forwarded to ``build_column``.
    size : int, optional
        Number of rows. When omitted it is derived from ``indices`` and
        ``offset`` (see below).
    offset : int, optional
        Row offset forwarded to ``build_column``. Only subtracted from
        the size when the size is derived rather than supplied.
    null_count : int, optional
        Null count forwarded to ``build_column``.

    Returns
    -------
    cudf.core.column.ListColumn
        The column produced by ``build_column``; the ``cast`` is only for
        the benefit of static type checkers.
    """
    list_dtype = ListDtype(element_type=elements.dtype)

    if size is None:
        # The offsets column holds one trailing entry beyond the per-row
        # entries, so a non-empty offsets column describes
        # ``len(indices) - 1`` rows; an empty one describes none.
        derived_rows = 0 if indices.size == 0 else indices.size - 1
        size = derived_rows - offset

    list_col = build_column(
        data=None,
        dtype=list_dtype,
        mask=mask,
        size=size,
        offset=offset,
        null_count=null_count,
        children=(indices, elements),
    )
    return cast("cudf.core.column.ListColumn", list_col)


def build_struct_column(
    names: Sequence[str],
    children: Tuple[ColumnBase, ...],
    dtype: Optional[Dtype] = None,
    mask: Optional[Buffer] = None,
    size: Optional[int] = None,
    offset: int = 0,
    null_count: Optional[int] = None,
) -> "cudf.core.column.StructColumn":
    """
    Assemble a StructColumn from named child columns.

    Parameters
    ----------
    names : sequence of strings
        Field names, paired positionally with ``children``. Only
        consulted when ``dtype`` is not given.
    children : tuple
        Child columns, one per struct field.
    dtype : Dtype, optional
        Dtype of the resulting column. When omitted, a ``StructDtype`` is
        built that maps each name to the dtype of the matching child.
    mask : Buffer, optional
        Null mask forwarded to ``build_column``.
    size : int, optional
        Forwarded to ``build_column`` unchanged.
    offset : int, optional
        Forwarded to ``build_column`` unchanged.
    null_count : int, optional
        Null count forwarded to ``build_column``.

    Returns
    -------
    cudf.core.column.StructColumn
        The column produced by ``build_column``; the ``cast`` is only for
        the benefit of static type checkers.
    """
    if dtype is None:
        # No explicit dtype: infer one field per (name, child) pair.
        inferred_fields = {
            field_name: child.dtype
            for field_name, child in zip(names, children)
        }
        dtype = StructDtype(fields=inferred_fields)

    struct_col = build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        size=size,
        offset=offset,
        null_count=null_count,
        children=children,
    )
    return cast("cudf.core.column.StructColumn", struct_col)


def _make_copy_replacing_NaT_with_null(column):
"""Return a copy with NaT values replaced with nulls."""
if np.issubdtype(column.dtype, np.timedelta64):
Expand Down
11 changes: 6 additions & 5 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,16 @@ def _with_type_metadata(
self: "cudf.core.column.ListColumn", dtype: Dtype
) -> "cudf.core.column.ListColumn":
if isinstance(dtype, ListDtype):
return column.build_list_column(
indices=self.base_children[0],
elements=self.base_children[1]._with_type_metadata(
dtype.element_type
),
elements = self.base_children[1]._with_type_metadata(
dtype.element_type
)
return ListColumn(
dtype=dtype,
mask=self.base_mask,
size=self.size,
offset=self.offset,
null_count=self.null_count,
children=(self.base_children[0], elements),
)

return self
Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from __future__ import annotations

from functools import cached_property
Expand All @@ -9,7 +9,7 @@

import cudf
from cudf._typing import Dtype
from cudf.core.column import ColumnBase, build_struct_column
from cudf.core.column import ColumnBase
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import StructDtype
from cudf.core.missing import NA
Expand Down Expand Up @@ -134,8 +134,9 @@ def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn:
if isinstance(dtype, IntervalDtype):
return IntervalColumn.from_struct_column(self, closed=dtype.closed)
elif isinstance(dtype, StructDtype):
return build_struct_column(
names=dtype.fields.keys(),
return StructColumn(
data=None,
dtype=dtype,
children=tuple(
self.base_children[i]._with_type_metadata(dtype.fields[f])
for i, f in enumerate(dtype.fields.keys())
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
StructColumn,
as_column,
build_categorical_column,
build_column,
Expand Down Expand Up @@ -7127,12 +7128,13 @@ def to_struct(self, name=None):
"requires field name to be string. Non-string column names "
"will be casted to string as the field name."
)
field_names = [str(name) for name in self._data.names]

col = cudf.core.column.build_struct_column(
names=field_names,
fields = {str(name): col.dtype for name, col in self._data.items()}
col = StructColumn(
data=None,
dtype=cudf.StructDtype(fields=fields),
children=tuple(col.copy(deep=True) for col in self._data.columns),
size=len(self),
offset=0,
)
return cudf.Series._from_data(
cudf.core.column_accessor.ColumnAccessor({name: col}),
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, as_column
from cudf.core.column.column import ColumnBase, StructDtype, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.join._join_helpers import _match_join_keys
from cudf.core.mixins import Reducible, Scannable
Expand Down Expand Up @@ -2036,10 +2036,14 @@ def _cov_or_corr(self, func, method_name):
)
x, y = str(x), str(y)

column_pair_structs[(x, y)] = cudf.core.column.build_struct_column(
names=(x, y),
# Pair the two columns into a single struct column so they can be
# aggregated together. StructDtype fields map field name -> dtype, so
# both entries must be the column's dtype, not the column object itself.
column_pair_structs[(x, y)] = cudf.core.column.StructColumn(
    data=None,
    dtype=StructDtype(
        fields={
            x: self.obj._data[x].dtype,
            y: self.obj._data[y].dtype,
        }
    ),
    children=(self.obj._data[x], self.obj._data[y]),
    size=len(self.obj),
    offset=0,
)

column_pair_groupby = cudf.DataFrame._from_data(
Expand Down
Loading