From 253dfdf4ca6cd05569a26744b2560594ab04cbd4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 4 Feb 2021 13:35:18 -0500 Subject: [PATCH] Fix copying dtype metadata after calling libcudf functions (#7271) Fixes #7249 Copies dtype metadata after calling `ColumnBase.copy()`. Moves logic for copying dtype metadata after calling libcudf functions from `Frame` to `ColumnBase`. Authors: - Ashwin Srinath (@shwina) Approvers: - Keith Kraus (@kkraus14) - GALI PREM SAGAR (@galipremsagar) URL: https://github.com/rapidsai/cudf/pull/7271 --- python/cudf/cudf/core/column/column.py | 47 +++++++++- python/cudf/cudf/core/column/struct.py | 3 + python/cudf/cudf/core/frame.py | 104 +++++++---------------- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/tests/test_struct.py | 11 +++ 6 files changed, 93 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4c4c253d8a7..9fe3283c187 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -520,7 +520,8 @@ def copy(self: T, deep: bool = True) -> T: copies the references of the data and mask. """ if deep: - return libcudf.copying.copy_column(self) + result = libcudf.copying.copy_column(self) + return cast(T, self._copy_type_metadata(result)) else: return cast( T, @@ -1346,6 +1347,50 @@ def scatter_to_table( } ) + def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase: + """ + Copies type metadata from self onto other, returning a new column. + + * when `self` is a CategoricalColumn and `other` is not, we assume + other is a column of codes, and return a CategoricalColumn composed + of `other` and the categories of `self`. + * when both `self` and `other` are StructColumns, rename the fields + of `other` to the field names of `self`. + * when `self` and `other` are nested columns of the same type, + recursively apply this function on the children of `self` to the + and the children of `other`. + * if none of the above, return `other` without any changes + """ + if isinstance(self, cudf.core.column.CategoricalColumn) and not ( + isinstance(other, cudf.core.column.CategoricalColumn) + ): + other = build_categorical_column( + categories=self.categories, + codes=as_column(other.base_data, dtype=other.dtype), + mask=other.base_mask, + ordered=self.ordered, + size=other.size, + offset=other.offset, + null_count=other.null_count, + ) + + if isinstance(other, cudf.core.column.StructColumn) and isinstance( + self, cudf.core.column.StructColumn + ): + other = other._rename_fields(self.dtype.fields.keys()) + + if type(self) is type(other): + if self.base_children and other.base_children: + base_children = tuple( + self.base_children[i]._copy_type_metadata( + other.base_children[i] + ) + for i in range(len(self.base_children)) + ) + other.set_base_children(base_children) + + return other + def column_empty_like( column: ColumnBase, diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 577738a2dca..adaf62ffc25 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations import pyarrow as pa @@ -7,6 +8,8 @@ class StructColumn(ColumnBase): + dtype: cudf.core.dtypes.StructDtype + @property def base_size(self): if not self.base_children: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 43d6416506d..c64cb7fdbcc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import operator import warnings from collections import OrderedDict, abc as abc -from typing import Any, Dict, Tuple, overload +from typing import TYPE_CHECKING, Any, Dict, Tuple, overload import cupy import numpy as np @@ -27,6 +27,9 @@ min_scalar_type, ) +if TYPE_CHECKING: + from cudf.core.column_accessor import ColumnAccessor + class Frame(libcudf.table.Table): """ @@ -40,6 +43,8 @@ class Frame(libcudf.table.Table): A Frame representing the (optional) index columns. """ + _data: "ColumnAccessor" + @classmethod def _from_table(cls, table: Frame): return cls(table._data, index=table._index) @@ -506,7 +511,7 @@ def _gather(self, gather_map, keep_index=True): self, as_column(gather_map), keep_index=keep_index ) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) if keep_index and self._index is not None: result._index.names = self._index.names return result @@ -521,7 +526,7 @@ def _hash_partition( self, columns_to_hash, num_partitions, keep_index ) output = self.__class__._from_table(output) - output._postprocess_columns(self, include_index=keep_index) + output._copy_type_metadata(self, include_index=keep_index) return output, offsets def _as_column(self): @@ -540,7 +545,7 @@ def _as_column(self): def _scatter(self, key, value): result = self._from_table(libcudf.copying.scatter(value, key, self)) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def _empty_like(self, keep_index=True): @@ -548,7 +553,7 @@ def _empty_like(self, keep_index=True): libcudf.copying.table_empty_like(self, keep_index) ) - result._postprocess_columns(self, include_index=keep_index) + result._copy_type_metadata(self, include_index=keep_index) return result def _slice(self, arg): @@ -601,7 +606,7 @@ def _slice(self, arg): )[0] ) - result._postprocess_columns(self, include_index=keep_index) + result._copy_type_metadata(self, include_index=keep_index) # Adding index of type RangeIndex back to # result if keep_index is False and self.index is not None: @@ -732,7 +737,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): for i, name in enumerate(self._data): output._data[name] = self._data[name].clip(lower[i], upper[i]) - output._postprocess_columns(self, include_index=False) + output._copy_type_metadata(self, include_index=False) return self._mimic_inplace(output, inplace=inplace) @@ -1075,7 +1080,7 @@ def _partition(self, scatter_map, npartitions, keep_index=True): result = partitioned._split(output_offsets, keep_index=keep_index) for frame in result: - frame._postprocess_columns(self, include_index=keep_index) + frame._copy_type_metadata(self, include_index=keep_index) if npartitions: for _ in range(npartitions - len(result)): @@ -1496,7 +1501,7 @@ def _drop_na_rows( frame, how=how, keys=subset, thresh=thresh ) ) - result._postprocess_columns(frame) + result._copy_type_metadata(frame) return result def _drop_na_columns(self, how="any", subset=None, thresh=None): @@ -1541,7 +1546,7 @@ def _apply_boolean_mask(self, boolean_mask): self, as_column(boolean_mask) ) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def _quantiles( @@ -1573,7 +1578,7 @@ def _quantiles( ) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def rank( @@ -1734,7 +1739,7 @@ def _repeat(self, count): libcudf.filling.repeat(self, count) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def _fill(self, fill_values, begin, end, inplace): @@ -2016,7 +2021,7 @@ def sample( keep_index=keep_index, ) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result else: @@ -2305,7 +2310,7 @@ def drop_duplicates( ) ) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def replace(self, to_replace: Any, replacement: Any) -> Frame: @@ -2344,44 +2349,23 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: return result - def _copy_categories(self, other, include_index=True): + def _copy_type_metadata( + self, other: Frame, include_index: bool = True + ) -> Frame: """ - Utility that copies category information from `other` - to `self`. + Copy type metadata from each column of `other` to the corresponding + column of `self`. + See `ColumnBase._copy_type_metadata` for more information. """ for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - if isinstance( - other_col, cudf.core.column.CategoricalColumn - ) and not isinstance(col, cudf.core.column.CategoricalColumn): - self._data[name] = build_categorical_column( - categories=other_col.categories, - codes=as_column(col.base_data, dtype=col.dtype), - mask=col.base_mask, - ordered=other_col.ordered, - size=col.size, - offset=col.offset, - null_count=col.null_count, - ) + self._data[name] = other_col._copy_type_metadata(col) + if include_index: - # include_index will still behave as False - # incase of self._index being a RangeIndex - if ( - self._index is not None - and not isinstance(self._index, cudf.core.index.RangeIndex) - and isinstance( - other._index, - (cudf.core.index.CategoricalIndex, cudf.MultiIndex), - ) - ): - self._index._postprocess_columns( - other._index, include_index=False - ) + if self._index is not None and other._index is not None: + self._index._copy_type_metadata(other._index) # When other._index is a CategoricalIndex, there is - # possibility that corresposing self._index be GenericIndex - # with codes. So to update even the class signature, we - # have to reconstruct self._index: if isinstance( other._index, cudf.core.index.CategoricalIndex ) and not isinstance( @@ -2390,37 +2374,9 @@ def _copy_categories(self, other, include_index=True): self._index = cudf.core.index.Index._from_table( self._index ) - return self - - def _copy_struct_names(self, other, include_index=True): - """ - Utility that copies struct field names. - """ - for name, col, other_col in zip( - self._data.keys(), self._data.values(), other._data.values() - ): - if isinstance(other_col, cudf.core.column.StructColumn): - self._data[name] = col._rename_fields( - other_col.dtype.fields.keys() - ) - - if include_index and self._index is not None: - for name, col, other_col in zip( - self._index._data.keys(), - self._index._data.values(), - other._index._data.values(), - ): - if isinstance(other_col, cudf.core.column.StructColumn): - self._index._data[name] = col._rename_fields( - other_col.dtype.fields.keys() - ) return self - def _postprocess_columns(self, other, include_index=True): - self._copy_categories(other, include_index=include_index) - self._copy_struct_names(other, include_index=include_index) - def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) data = zip(self._column_names, data_columns) @@ -2651,7 +2607,7 @@ def tile(self, count): The table containing the tiled "rows". """ result = self.__class__._from_table(libcudf.reshape.tile(self, count)) - result._postprocess_columns(self) + result._copy_type_metadata(self) return result def searchsorted( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8af3b6f1d81..86e1f5cfe30 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -198,7 +198,7 @@ def agg(self, func): result.index.names = self.grouping.names # copy categorical information from keys to the result index: - result.index._postprocess_columns(self.grouping.keys) + result.index._copy_type_metadata(self.grouping.keys) result._index = cudf.core.index.Index._from_table(result._index) if not self._as_index: @@ -263,7 +263,7 @@ def _grouped(self): grouped_keys = cudf.Index._from_table(grouped_keys) grouped_values = self.obj.__class__._from_table(grouped_values) - grouped_values._postprocess_columns(self.obj) + grouped_values._copy_type_metadata(self.obj) group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 5b67cac7a34..36b412f8c53 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -788,7 +788,7 @@ def merge_sorted( na_position=na_position, ) ) - result._postprocess_columns(objs[0]) + result._copy_type_metadata(objs[0]) return result diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index e0e4631fea8..c7efb55c089 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -23,3 +23,14 @@ def test_create_struct_series(data): expect = pd.Series(data) got = cudf.Series(data) assert_eq(expect, got, check_dtype=False) + + +def test_struct_of_struct_copy(): + sr = cudf.Series([{"a": {"b": 1}}]) + assert_eq(sr, sr.copy()) + + +def test_struct_of_struct_loc(): + df = cudf.DataFrame({"col": [{"a": {"b": 1}}]}) + expect = cudf.Series([{"a": {"b": 1}}], name="col") + assert_eq(expect, df["col"])