Skip to content

Commit

Permalink
Fix copying dtype metadata after calling libcudf functions (#7271)
Browse files Browse the repository at this point in the history
Fixes #7249

Copies dtype metadata after calling `ColumnBase.copy()`. Moves logic for copying dtype metadata after calling libcudf functions from `Frame` to `ColumnBase`.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)
  - GALI PREM SAGAR (@galipremsagar)

URL: #7271
  • Loading branch information
shwina authored Feb 4, 2021
1 parent 8334700 commit 253dfdf
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 78 deletions.
47 changes: 46 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,8 @@ def copy(self: T, deep: bool = True) -> T:
copies the references of the data and mask.
"""
if deep:
return libcudf.copying.copy_column(self)
result = libcudf.copying.copy_column(self)
return cast(T, self._copy_type_metadata(result))
else:
return cast(
T,
Expand Down Expand Up @@ -1346,6 +1347,50 @@ def scatter_to_table(
}
)

def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
"""
Copies type metadata from self onto other, returning a new column.
* when `self` is a CategoricalColumn and `other` is not, we assume
other is a column of codes, and return a CategoricalColumn composed
of `other` and the categories of `self`.
* when both `self` and `other` are StructColumns, rename the fields
of `other` to the field names of `self`.
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` to the
and the children of `other`.
* if none of the above, return `other` without any changes
"""
if isinstance(self, cudf.core.column.CategoricalColumn) and not (
isinstance(other, cudf.core.column.CategoricalColumn)
):
other = build_categorical_column(
categories=self.categories,
codes=as_column(other.base_data, dtype=other.dtype),
mask=other.base_mask,
ordered=self.ordered,
size=other.size,
offset=other.offset,
null_count=other.null_count,
)

if isinstance(other, cudf.core.column.StructColumn) and isinstance(
self, cudf.core.column.StructColumn
):
other = other._rename_fields(self.dtype.fields.keys())

if type(self) is type(other):
if self.base_children and other.base_children:
base_children = tuple(
self.base_children[i]._copy_type_metadata(
other.base_children[i]
)
for i in range(len(self.base_children))
)
other.set_base_children(base_children)

return other


def column_empty_like(
column: ColumnBase,
Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
from __future__ import annotations

import pyarrow as pa

Expand All @@ -7,6 +8,8 @@


class StructColumn(ColumnBase):
dtype: cudf.core.dtypes.StructDtype

@property
def base_size(self):
if not self.base_children:
Expand Down
104 changes: 30 additions & 74 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import operator
import warnings
from collections import OrderedDict, abc as abc
from typing import Any, Dict, Tuple, overload
from typing import TYPE_CHECKING, Any, Dict, Tuple, overload

import cupy
import numpy as np
Expand All @@ -27,6 +27,9 @@
min_scalar_type,
)

if TYPE_CHECKING:
from cudf.core.column_accessor import ColumnAccessor


class Frame(libcudf.table.Table):
"""
Expand All @@ -40,6 +43,8 @@ class Frame(libcudf.table.Table):
A Frame representing the (optional) index columns.
"""

_data: "ColumnAccessor"

@classmethod
def _from_table(cls, table: Frame):
return cls(table._data, index=table._index)
Expand Down Expand Up @@ -506,7 +511,7 @@ def _gather(self, gather_map, keep_index=True):
self, as_column(gather_map), keep_index=keep_index
)
)
result._postprocess_columns(self)
result._copy_type_metadata(self)
if keep_index and self._index is not None:
result._index.names = self._index.names
return result
Expand All @@ -521,7 +526,7 @@ def _hash_partition(
self, columns_to_hash, num_partitions, keep_index
)
output = self.__class__._from_table(output)
output._postprocess_columns(self, include_index=keep_index)
output._copy_type_metadata(self, include_index=keep_index)
return output, offsets

def _as_column(self):
Expand All @@ -540,15 +545,15 @@ def _as_column(self):
def _scatter(self, key, value):
result = self._from_table(libcudf.copying.scatter(value, key, self))

result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def _empty_like(self, keep_index=True):
result = self._from_table(
libcudf.copying.table_empty_like(self, keep_index)
)

result._postprocess_columns(self, include_index=keep_index)
result._copy_type_metadata(self, include_index=keep_index)
return result

def _slice(self, arg):
Expand Down Expand Up @@ -601,7 +606,7 @@ def _slice(self, arg):
)[0]
)

result._postprocess_columns(self, include_index=keep_index)
result._copy_type_metadata(self, include_index=keep_index)
# Adding index of type RangeIndex back to
# result
if keep_index is False and self.index is not None:
Expand Down Expand Up @@ -732,7 +737,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
for i, name in enumerate(self._data):
output._data[name] = self._data[name].clip(lower[i], upper[i])

output._postprocess_columns(self, include_index=False)
output._copy_type_metadata(self, include_index=False)

return self._mimic_inplace(output, inplace=inplace)

Expand Down Expand Up @@ -1075,7 +1080,7 @@ def _partition(self, scatter_map, npartitions, keep_index=True):
result = partitioned._split(output_offsets, keep_index=keep_index)

for frame in result:
frame._postprocess_columns(self, include_index=keep_index)
frame._copy_type_metadata(self, include_index=keep_index)

if npartitions:
for _ in range(npartitions - len(result)):
Expand Down Expand Up @@ -1496,7 +1501,7 @@ def _drop_na_rows(
frame, how=how, keys=subset, thresh=thresh
)
)
result._postprocess_columns(frame)
result._copy_type_metadata(frame)
return result

def _drop_na_columns(self, how="any", subset=None, thresh=None):
Expand Down Expand Up @@ -1541,7 +1546,7 @@ def _apply_boolean_mask(self, boolean_mask):
self, as_column(boolean_mask)
)
)
result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def _quantiles(
Expand Down Expand Up @@ -1573,7 +1578,7 @@ def _quantiles(
)
)

result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def rank(
Expand Down Expand Up @@ -1734,7 +1739,7 @@ def _repeat(self, count):
libcudf.filling.repeat(self, count)
)

result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def _fill(self, fill_values, begin, end, inplace):
Expand Down Expand Up @@ -2016,7 +2021,7 @@ def sample(
keep_index=keep_index,
)
)
result._postprocess_columns(self)
result._copy_type_metadata(self)

return result
else:
Expand Down Expand Up @@ -2305,7 +2310,7 @@ def drop_duplicates(
)
)

result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def replace(self, to_replace: Any, replacement: Any) -> Frame:
Expand Down Expand Up @@ -2344,44 +2349,23 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame:

return result

def _copy_categories(self, other, include_index=True):
def _copy_type_metadata(
self, other: Frame, include_index: bool = True
) -> Frame:
"""
Utility that copies category information from `other`
to `self`.
Copy type metadata from each column of `other` to the corresponding
column of `self`.
See `ColumnBase._copy_type_metadata` for more information.
"""
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
if isinstance(
other_col, cudf.core.column.CategoricalColumn
) and not isinstance(col, cudf.core.column.CategoricalColumn):
self._data[name] = build_categorical_column(
categories=other_col.categories,
codes=as_column(col.base_data, dtype=col.dtype),
mask=col.base_mask,
ordered=other_col.ordered,
size=col.size,
offset=col.offset,
null_count=col.null_count,
)
self._data[name] = other_col._copy_type_metadata(col)

if include_index:
# include_index will still behave as False
# incase of self._index being a RangeIndex
if (
self._index is not None
and not isinstance(self._index, cudf.core.index.RangeIndex)
and isinstance(
other._index,
(cudf.core.index.CategoricalIndex, cudf.MultiIndex),
)
):
self._index._postprocess_columns(
other._index, include_index=False
)
if self._index is not None and other._index is not None:
self._index._copy_type_metadata(other._index)
# When other._index is a CategoricalIndex, there is
# possibility that corresposing self._index be GenericIndex
# with codes. So to update even the class signature, we
# have to reconstruct self._index:
if isinstance(
other._index, cudf.core.index.CategoricalIndex
) and not isinstance(
Expand All @@ -2390,37 +2374,9 @@ def _copy_categories(self, other, include_index=True):
self._index = cudf.core.index.Index._from_table(
self._index
)
return self

def _copy_struct_names(self, other, include_index=True):
"""
Utility that copies struct field names.
"""
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
if isinstance(other_col, cudf.core.column.StructColumn):
self._data[name] = col._rename_fields(
other_col.dtype.fields.keys()
)

if include_index and self._index is not None:
for name, col, other_col in zip(
self._index._data.keys(),
self._index._data.values(),
other._index._data.values(),
):
if isinstance(other_col, cudf.core.column.StructColumn):
self._index._data[name] = col._rename_fields(
other_col.dtype.fields.keys()
)

return self

def _postprocess_columns(self, other, include_index=True):
self._copy_categories(other, include_index=include_index)
self._copy_struct_names(other, include_index=include_index)

def _unaryop(self, op):
data_columns = (col.unary_operator(op) for col in self._columns)
data = zip(self._column_names, data_columns)
Expand Down Expand Up @@ -2651,7 +2607,7 @@ def tile(self, count):
The table containing the tiled "rows".
"""
result = self.__class__._from_table(libcudf.reshape.tile(self, count))
result._postprocess_columns(self)
result._copy_type_metadata(self)
return result

def searchsorted(
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def agg(self, func):
result.index.names = self.grouping.names

# copy categorical information from keys to the result index:
result.index._postprocess_columns(self.grouping.keys)
result.index._copy_type_metadata(self.grouping.keys)
result._index = cudf.core.index.Index._from_table(result._index)

if not self._as_index:
Expand Down Expand Up @@ -263,7 +263,7 @@ def _grouped(self):

grouped_keys = cudf.Index._from_table(grouped_keys)
grouped_values = self.obj.__class__._from_table(grouped_values)
grouped_values._postprocess_columns(self.obj)
grouped_values._copy_type_metadata(self.obj)
group_names = grouped_keys.unique()
return (group_names, offsets, grouped_keys, grouped_values)

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,7 +788,7 @@ def merge_sorted(
na_position=na_position,
)
)
result._postprocess_columns(objs[0])
result._copy_type_metadata(objs[0])
return result


Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,14 @@ def test_create_struct_series(data):
expect = pd.Series(data)
got = cudf.Series(data)
assert_eq(expect, got, check_dtype=False)


def test_struct_of_struct_copy():
sr = cudf.Series([{"a": {"b": 1}}])
assert_eq(sr, sr.copy())


def test_struct_of_struct_loc():
df = cudf.DataFrame({"col": [{"a": {"b": 1}}]})
expect = cudf.Series([{"a": {"b": 1}}], name="col")
assert_eq(expect, df["col"])

0 comments on commit 253dfdf

Please sign in to comment.