From 253dfdf4ca6cd05569a26744b2560594ab04cbd4 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Thu, 4 Feb 2021 13:35:18 -0500
Subject: [PATCH] Fix copying dtype metadata after calling libcudf functions
 (#7271)

Fixes #7249

Copies dtype metadata after calling `ColumnBase.copy()`. Moves logic for copying dtype metadata after calling libcudf functions from `Frame` to `ColumnBase`.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7271
---
 python/cudf/cudf/core/column/column.py   |  47 +++++++++-
 python/cudf/cudf/core/column/struct.py   |   3 +
 python/cudf/cudf/core/frame.py           | 104 +++++++----------------
 python/cudf/cudf/core/groupby/groupby.py |   4 +-
 python/cudf/cudf/core/reshape.py         |   2 +-
 python/cudf/cudf/tests/test_struct.py    |  11 +++
 6 files changed, 93 insertions(+), 78 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 4c4c253d8a7..9fe3283c187 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -520,7 +520,8 @@ def copy(self: T, deep: bool = True) -> T:
         copies the references of the data and mask.
         """
         if deep:
-            return libcudf.copying.copy_column(self)
+            result = libcudf.copying.copy_column(self)
+            return cast(T, self._copy_type_metadata(result))
         else:
             return cast(
                 T,
@@ -1346,6 +1347,50 @@ def scatter_to_table(
             }
         )
 
+    def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
+        """
+        Copies type metadata from self onto other, returning a new column.
+
+        * when `self` is a CategoricalColumn and `other` is not, we assume
+          other is a column of codes, and return a CategoricalColumn composed
+          of `other`  and the categories of `self`.
+        * when both `self` and `other` are StructColumns, rename the fields
+          of `other` to the field names of `self`.
+        * when `self` and `other` are nested columns of the same type,
+          recursively apply this function on the children of `self` to the
+          and the children of `other`.
+        * if none of the above, return `other` without any changes
+        """
+        if isinstance(self, cudf.core.column.CategoricalColumn) and not (
+            isinstance(other, cudf.core.column.CategoricalColumn)
+        ):
+            other = build_categorical_column(
+                categories=self.categories,
+                codes=as_column(other.base_data, dtype=other.dtype),
+                mask=other.base_mask,
+                ordered=self.ordered,
+                size=other.size,
+                offset=other.offset,
+                null_count=other.null_count,
+            )
+
+        if isinstance(other, cudf.core.column.StructColumn) and isinstance(
+            self, cudf.core.column.StructColumn
+        ):
+            other = other._rename_fields(self.dtype.fields.keys())
+
+        if type(self) is type(other):
+            if self.base_children and other.base_children:
+                base_children = tuple(
+                    self.base_children[i]._copy_type_metadata(
+                        other.base_children[i]
+                    )
+                    for i in range(len(self.base_children))
+                )
+                other.set_base_children(base_children)
+
+        return other
+
 
 def column_empty_like(
     column: ColumnBase,
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 577738a2dca..adaf62ffc25 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import pyarrow as pa
 
@@ -7,6 +8,8 @@
 
 
 class StructColumn(ColumnBase):
+    dtype: cudf.core.dtypes.StructDtype
+
     @property
     def base_size(self):
         if not self.base_children:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 43d6416506d..c64cb7fdbcc 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import operator
 import warnings
 from collections import OrderedDict, abc as abc
-from typing import Any, Dict, Tuple, overload
+from typing import TYPE_CHECKING, Any, Dict, Tuple, overload
 
 import cupy
 import numpy as np
@@ -27,6 +27,9 @@
     min_scalar_type,
 )
 
+if TYPE_CHECKING:
+    from cudf.core.column_accessor import ColumnAccessor
+
 
 class Frame(libcudf.table.Table):
     """
@@ -40,6 +43,8 @@ class Frame(libcudf.table.Table):
         A Frame representing the (optional) index columns.
     """
 
+    _data: "ColumnAccessor"
+
     @classmethod
     def _from_table(cls, table: Frame):
         return cls(table._data, index=table._index)
@@ -506,7 +511,7 @@ def _gather(self, gather_map, keep_index=True):
                 self, as_column(gather_map), keep_index=keep_index
             )
         )
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         if keep_index and self._index is not None:
             result._index.names = self._index.names
         return result
@@ -521,7 +526,7 @@ def _hash_partition(
             self, columns_to_hash, num_partitions, keep_index
         )
         output = self.__class__._from_table(output)
-        output._postprocess_columns(self, include_index=keep_index)
+        output._copy_type_metadata(self, include_index=keep_index)
         return output, offsets
 
     def _as_column(self):
@@ -540,7 +545,7 @@ def _as_column(self):
     def _scatter(self, key, value):
         result = self._from_table(libcudf.copying.scatter(value, key, self))
 
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def _empty_like(self, keep_index=True):
@@ -548,7 +553,7 @@ def _empty_like(self, keep_index=True):
             libcudf.copying.table_empty_like(self, keep_index)
         )
 
-        result._postprocess_columns(self, include_index=keep_index)
+        result._copy_type_metadata(self, include_index=keep_index)
         return result
 
     def _slice(self, arg):
@@ -601,7 +606,7 @@ def _slice(self, arg):
                     )[0]
                 )
 
-                result._postprocess_columns(self, include_index=keep_index)
+                result._copy_type_metadata(self, include_index=keep_index)
                 # Adding index of type RangeIndex back to
                 # result
                 if keep_index is False and self.index is not None:
@@ -732,7 +737,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         for i, name in enumerate(self._data):
             output._data[name] = self._data[name].clip(lower[i], upper[i])
 
-        output._postprocess_columns(self, include_index=False)
+        output._copy_type_metadata(self, include_index=False)
 
         return self._mimic_inplace(output, inplace=inplace)
 
@@ -1075,7 +1080,7 @@ def _partition(self, scatter_map, npartitions, keep_index=True):
         result = partitioned._split(output_offsets, keep_index=keep_index)
 
         for frame in result:
-            frame._postprocess_columns(self, include_index=keep_index)
+            frame._copy_type_metadata(self, include_index=keep_index)
 
         if npartitions:
             for _ in range(npartitions - len(result)):
@@ -1496,7 +1501,7 @@ def _drop_na_rows(
                 frame, how=how, keys=subset, thresh=thresh
             )
         )
-        result._postprocess_columns(frame)
+        result._copy_type_metadata(frame)
         return result
 
     def _drop_na_columns(self, how="any", subset=None, thresh=None):
@@ -1541,7 +1546,7 @@ def _apply_boolean_mask(self, boolean_mask):
                 self, as_column(boolean_mask)
             )
         )
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def _quantiles(
@@ -1573,7 +1578,7 @@ def _quantiles(
             )
         )
 
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def rank(
@@ -1734,7 +1739,7 @@ def _repeat(self, count):
             libcudf.filling.repeat(self, count)
         )
 
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def _fill(self, fill_values, begin, end, inplace):
@@ -2016,7 +2021,7 @@ def sample(
                     keep_index=keep_index,
                 )
             )
-            result._postprocess_columns(self)
+            result._copy_type_metadata(self)
 
             return result
         else:
@@ -2305,7 +2310,7 @@ def drop_duplicates(
             )
         )
 
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def replace(self, to_replace: Any, replacement: Any) -> Frame:
@@ -2344,44 +2349,23 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame:
 
         return result
 
-    def _copy_categories(self, other, include_index=True):
+    def _copy_type_metadata(
+        self, other: Frame, include_index: bool = True
+    ) -> Frame:
         """
-        Utility that copies category information from `other`
-        to `self`.
+        Copy type metadata from each column of `other` to the corresponding
+        column of `self`.
+        See `ColumnBase._copy_type_metadata` for more information.
         """
         for name, col, other_col in zip(
             self._data.keys(), self._data.values(), other._data.values()
         ):
-            if isinstance(
-                other_col, cudf.core.column.CategoricalColumn
-            ) and not isinstance(col, cudf.core.column.CategoricalColumn):
-                self._data[name] = build_categorical_column(
-                    categories=other_col.categories,
-                    codes=as_column(col.base_data, dtype=col.dtype),
-                    mask=col.base_mask,
-                    ordered=other_col.ordered,
-                    size=col.size,
-                    offset=col.offset,
-                    null_count=col.null_count,
-                )
+            self._data[name] = other_col._copy_type_metadata(col)
+
         if include_index:
-            # include_index will still behave as False
-            # incase of self._index being a RangeIndex
-            if (
-                self._index is not None
-                and not isinstance(self._index, cudf.core.index.RangeIndex)
-                and isinstance(
-                    other._index,
-                    (cudf.core.index.CategoricalIndex, cudf.MultiIndex),
-                )
-            ):
-                self._index._postprocess_columns(
-                    other._index, include_index=False
-                )
+            if self._index is not None and other._index is not None:
+                self._index._copy_type_metadata(other._index)
                 # When other._index is a CategoricalIndex, there is
-                # possibility that corresposing self._index be GenericIndex
-                # with codes. So to update even the class signature, we
-                # have to reconstruct self._index:
                 if isinstance(
                     other._index, cudf.core.index.CategoricalIndex
                 ) and not isinstance(
@@ -2390,37 +2374,9 @@ def _copy_categories(self, other, include_index=True):
                     self._index = cudf.core.index.Index._from_table(
                         self._index
                     )
-        return self
-
-    def _copy_struct_names(self, other, include_index=True):
-        """
-        Utility that copies struct field names.
-        """
-        for name, col, other_col in zip(
-            self._data.keys(), self._data.values(), other._data.values()
-        ):
-            if isinstance(other_col, cudf.core.column.StructColumn):
-                self._data[name] = col._rename_fields(
-                    other_col.dtype.fields.keys()
-                )
-
-        if include_index and self._index is not None:
-            for name, col, other_col in zip(
-                self._index._data.keys(),
-                self._index._data.values(),
-                other._index._data.values(),
-            ):
-                if isinstance(other_col, cudf.core.column.StructColumn):
-                    self._index._data[name] = col._rename_fields(
-                        other_col.dtype.fields.keys()
-                    )
 
         return self
 
-    def _postprocess_columns(self, other, include_index=True):
-        self._copy_categories(other, include_index=include_index)
-        self._copy_struct_names(other, include_index=include_index)
-
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
         data = zip(self._column_names, data_columns)
@@ -2651,7 +2607,7 @@ def tile(self, count):
         The table containing the tiled "rows".
         """
         result = self.__class__._from_table(libcudf.reshape.tile(self, count))
-        result._postprocess_columns(self)
+        result._copy_type_metadata(self)
         return result
 
     def searchsorted(
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 8af3b6f1d81..86e1f5cfe30 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -198,7 +198,7 @@ def agg(self, func):
             result.index.names = self.grouping.names
 
         # copy categorical information from keys to the result index:
-        result.index._postprocess_columns(self.grouping.keys)
+        result.index._copy_type_metadata(self.grouping.keys)
         result._index = cudf.core.index.Index._from_table(result._index)
 
         if not self._as_index:
@@ -263,7 +263,7 @@ def _grouped(self):
 
         grouped_keys = cudf.Index._from_table(grouped_keys)
         grouped_values = self.obj.__class__._from_table(grouped_values)
-        grouped_values._postprocess_columns(self.obj)
+        grouped_values._copy_type_metadata(self.obj)
         group_names = grouped_keys.unique()
         return (group_names, offsets, grouped_keys, grouped_values)
 
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 5b67cac7a34..36b412f8c53 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -788,7 +788,7 @@ def merge_sorted(
             na_position=na_position,
         )
     )
-    result._postprocess_columns(objs[0])
+    result._copy_type_metadata(objs[0])
     return result
 
 
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index e0e4631fea8..c7efb55c089 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -23,3 +23,14 @@ def test_create_struct_series(data):
     expect = pd.Series(data)
     got = cudf.Series(data)
     assert_eq(expect, got, check_dtype=False)
+
+
+def test_struct_of_struct_copy():
+    sr = cudf.Series([{"a": {"b": 1}}])
+    assert_eq(sr, sr.copy())
+
+
+def test_struct_of_struct_loc():
+    df = cudf.DataFrame({"col": [{"a": {"b": 1}}]})
+    expect = cudf.Series([{"a": {"b": 1}}], name="col")
+    assert_eq(expect, df["col"])