Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add serialization methods for List and StructDtype #8441

Merged
merged 10 commits into from
Jun 21, 2021
15 changes: 11 additions & 4 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,14 @@ def set_base_data(self, value):

def serialize(self):
header = {}
frames = []
header["type-serialized"] = pickle.dumps(type(self))
header["dtype"] = pickle.dumps(self.dtype)
header["null_count"] = self.null_count
header["size"] = self.size
header["dtype"], dtype_frames = self.dtype.serialize()
header["dtype_frames_count"] = len(dtype_frames)
frames.extend(dtype_frames)

frames = []
sub_headers = []

for item in self.children:
Expand All @@ -211,9 +213,14 @@ def deserialize(cls, header, frames):
else:
mask = None

# Deserialize dtype
dtype = pickle.loads(header["dtype"]["type-serialized"]).deserialize(
header["dtype"], frames[: header["dtype_frames_count"]]
)

# Deserialize child columns
children = []
f = 0
f = header["dtype_frames_count"]
for h in header["subheaders"]:
fcount = h["frame_count"]
child_frames = frames[f : f + fcount]
Expand All @@ -224,7 +231,7 @@ def deserialize(cls, header, frames):
# Materialize list column
return column.build_column(
data=None,
dtype=pickle.loads(header["dtype"]),
dtype=dtype,
mask=mask,
children=tuple(children),
size=header["size"],
Expand Down
80 changes: 76 additions & 4 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import decimal
import pickle
from typing import Any, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
Expand All @@ -17,9 +17,11 @@

import cudf
from cudf._typing import Dtype
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer


class _BaseDtype(ExtensionDtype):
class _BaseDtype(ExtensionDtype, Serializable):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something seems weird about this. We're making all of our extension dtypes serializable, but I believe we end up needing to override serialize and deserialize for all of them (ListDtype, StructDtype, CategoricalDtype). To me that suggests either the parent class needs to be generalized to be able to do at least some of the common work between these child classes, or that this inheritance relationship just isn't quite right.

I am weakly -1 on doing this as part of this PR. Maybe it makes more sense to add the serialize/deserialize methods in this PR and then refactor the common code out, either into Serializable or something that goes in between Serializable and _BaseDtype, in a separate PR.

Copy link
Contributor

@shwina shwina Jun 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Originally, Serializable was an abstract base class, which forced all derived classes to implement serialize and deserialize. For performance reasons, we disabled that and made it a regular class. Now, derived classes must implement serialize and deserialize, but that is "only" by convention.

That being said, there's still very much value in inheriting from Serializable, as we get the methods host_serialize, device_serialize, host_deserialize, device_deserialize "for free" by the inheritance.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I agree with @brandon-b-miller's objection to making this change, but not the reasons.

Serializable declares an interface, but leaves it up to subclasses to implement it. Whether or not certain subclasses (e.g. all dtypes) can share parts (or all) of that implementation isn't really relevant to whether or not the inheritance pattern makes sense. All that inheriting from Serializable does is indicate that if subclasses implement serialize and deserialize, it will be possible to do pickle.dumps(obj).

All of the *_(?:de)?serialize methods just exist to provide hooks into Serializable.__reduce_ex__, the method that actually enables serialization. My issue with using Serializable for dtype objects is that these hooks are all predicated on the assumption that a subclass of Serializable can be decomposed into some header of metadata and a collection of frames, which isn't the case for dtypes. If you look at the contents of the methods implemented by Serializable, they're encoding a bunch of metadata that IMO isn't really appropriate for a dtype, but rather for typed memory buffers (e.g. the length of the array or whether it's stored in device memory).

That being the case, I think that it would be simpler and more appropriate to directly implement the pickling protocol (ideally via __getstate__ and __setstate__, but if not then via __reduce* methods) rather than trying to leverage Serializable. To @brandon-b-miller's point, if some of that logic can be shared between dtypes it would also be great to do that by implementing it at the level of _BaseDtype.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do see some merit to @brandon-b-miller's point of making subclasses of Serializable that generalize some of the common work that's happening in the serialization function, though I haven't really inspected those functions outside of the dtypes to see if there's a lot of intersection there - were you thinking something like SerializableDtype, SerializableFrame, etc...?

To @vyasr's point, I feel like implementing the pickling protocol for the dtypes themselves could result in redundant code, since it would essentially entail copying Serializable.__reduce_ex__ in _BaseDtype. Is there a downside to having host/device deserialization implemented for dtypes other than the fact that those functions aren't really appropriate?

Also feel like that scenario gives more motivation for making subclasses of Serializable, as we could have subclasses that include/exclude the functions we consider inappropriate for their derived classes (such as the host/device serialization).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Serializable is less about making objects picklable and more about serializing objects according to the Dask serialization protocol. The *serialize methods are absolutely required here in order for dtype objects to be able to be sent efficiently across the wire.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's true though that most dtypes really are composed only of metadata. The exception being CategoricalDtype, which for compatibility with Pandas, encapsulates also a column of categories (residing on the device).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having read a little more I'm comfortable 👍 -ing here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I see now that we're registering Serializable's methods to dask.distributed in cudf/comm/serialize.py. It does seem like we could simplify the specifics of the serialization protocol for dtypes since they are (almost) entirely metadata and not data, but I think moving forward with this approach is fine for now.

# Base type for all cudf-specific dtypes
pass

Expand Down Expand Up @@ -111,12 +113,16 @@ def construct_from_string(self):

def serialize(self):
    """Serialize this categorical dtype to a ``(header, frames)`` pair.

    The header carries the pickled dtype class (so a generic
    deserializer can dispatch to the right class) and the ``ordered``
    flag; when categories are present, their serialized header is
    stored under ``"categories"`` and their frames are appended.
    """
    header = {}
    # Single initialization — the original text carried a stray
    # duplicate ``frames = []`` left over from the diff.
    frames = []
    header["type-serialized"] = pickle.dumps(type(self))
    header["ordered"] = self.ordered

    if self.categories is not None:
        categories_header, categories_frames = self.categories.serialize()
        header["categories"] = categories_header
        frames.extend(categories_frames)

    return header, frames

@classmethod
Expand Down Expand Up @@ -191,6 +197,30 @@ def __repr__(self):
def __hash__(self):
    # Delegate hashing to the wrapped type object (self._typ).
    return hash(self._typ)

def serialize(self) -> Tuple[dict, list]:
    """Serialize this list dtype to a ``(header, frames)`` pair.

    The element type is recursively serialized when it is itself a
    cudf dtype (``_BaseDtype``); otherwise it is stored in the header
    as-is and no frames are produced.
    """
    frames = []
    header: Dict[str, Dtype] = {"type-serialized": pickle.dumps(type(self))}

    element = self.element_type
    if isinstance(element, _BaseDtype):
        header["element-type"], frames = element.serialize()
    else:
        header["element-type"] = element

    return header, frames

@classmethod
def deserialize(cls, header: dict, frames: list):
    """Reconstruct a list dtype from a serialized ``(header, frames)`` pair."""
    element = header["element-type"]
    if isinstance(element, dict):
        # A nested cudf dtype: dispatch to its own class' deserialize.
        element = pickle.loads(element["type-serialized"]).deserialize(
            element, frames
        )
    return cls(element_type=element)


class StructDtype(_BaseDtype):

Expand Down Expand Up @@ -242,6 +272,41 @@ def __repr__(self):
def __hash__(self):
    # Delegate hashing to the wrapped type object (self._typ).
    return hash(self._typ)

def serialize(self) -> Tuple[dict, list]:
    """Serialize this struct dtype to a ``(header, frames)`` pair.

    Fields whose dtype is itself a cudf dtype (``_BaseDtype``) are
    serialized recursively; for each such field the header records the
    nested header together with the half-open ``(start, stop)`` range
    of frame indices it owns. Plain dtypes go into the header as-is.
    """
    header: Dict[str, Any] = {"type-serialized": pickle.dumps(type(self))}

    frames: List[Buffer] = []
    encoded_fields = {}

    for name, field_dtype in self.fields.items():
        if not isinstance(field_dtype, _BaseDtype):
            encoded_fields[name] = field_dtype
        else:
            sub_header, sub_frames = field_dtype.serialize()
            start = len(frames)
            frames.extend(sub_frames)
            encoded_fields[name] = (sub_header, (start, len(frames)))
    header["fields"] = encoded_fields

    return header, frames

@classmethod
def deserialize(cls, header: dict, frames: list):
    """Rebuild a struct dtype from a serialized ``(header, frames)`` pair."""
    fields = {}
    for name, encoded in header["fields"].items():
        if not isinstance(encoded, tuple):
            # Plain dtype stored directly in the header.
            fields[name] = encoded
        else:
            # Nested cudf dtype: unpack its header and its frame range.
            sub_header, (start, stop) = encoded
            sub_cls = pickle.loads(sub_header["type-serialized"])
            fields[name] = sub_cls.deserialize(sub_header, frames[start:stop])
    return cls(fields)


class Decimal64Dtype(_BaseDtype):

Expand Down Expand Up @@ -337,7 +402,14 @@ def _from_decimal(cls, decimal):
return cls(precision, -metadata.exponent)

def serialize(self) -> Tuple[dict, list]:
    """Serialize this decimal dtype to a ``(header, frames)`` pair.

    The header carries the pickled dtype class (so a generic
    deserializer can dispatch to the right class) plus the precision
    and scale; decimal dtypes own no frames. The stray pre-diff
    one-line return (which lacked "type-serialized" and shadowed the
    rest of the body) is removed.
    """
    return (
        {
            "type-serialized": pickle.dumps(type(self)),
            "precision": self.precision,
            "scale": self.scale,
        },
        [],
    )

@classmethod
def deserialize(cls, header: dict, frames: list):
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,3 +804,25 @@ def test_series_construction_with_nulls(input_obj, dtype):
got = cudf.Series(input_obj, dtype="category").to_pandas()

assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        {"a": cudf.Series(["a", "b", "c", "a", "c", "b"]).astype("category")},
        {
            "a": cudf.Series(["a", "a", "b", "b"]).astype("category"),
            "b": cudf.Series(["b", "b", "c", "c"]).astype("category"),
            "c": cudf.Series(["c", "c", "a", "a"]).astype("category"),
        },
        {
            "a": cudf.Series(["a", None, "b", "b"]).astype("category"),
            "b": cudf.Series(["b", "b", None, "c"]).astype("category"),
            "c": cudf.Series(["c", "c", "a", None]).astype("category"),
        },
    ],
)
def test_serialize_categorical_columns(data):
    # Round-trip a categorical DataFrame through serialize/deserialize.
    frame = cudf.DataFrame(data)
    restored = type(frame).deserialize(*frame.serialize())
    assert_eq(restored, frame)
38 changes: 38 additions & 0 deletions python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,41 @@ def test_series_construction_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
    "data",
    [
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            )
        },
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            ),
            "b": _decimal_series(
                ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)
            ),
            "c": _decimal_series(
                ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
            ),
        },
        {
            "a": _decimal_series(
                ["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)
            ),
            "b": _decimal_series(
                ["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1)
            ),
            "c": _decimal_series(
                [None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
            ),
        },
    ],
)
def test_serialize_decimal_columns(data):
    # Round-trip a decimal DataFrame through serialize/deserialize.
    frame = cudf.DataFrame(data)
    restored = type(frame).deserialize(*frame.serialize())
    assert_eq(restored, frame)
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,23 @@ def test_construction_series_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
    "data",
    [
        {"a": [[]]},
        {"a": [[1, 2, None, 4]]},
        {"a": [["cat", None, "dog"]]},
        {
            "a": [[1, 2, 3, None], [4, None, 5]],
            "b": [None, ["fish", "bird"]],
            "c": [[], []],
        },
        {"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]},
    ],
)
def test_serialize_list_columns(data):
    # Round-trip a list-typed DataFrame through serialize/deserialize.
    frame = cudf.DataFrame(data)
    restored = type(frame).deserialize(*frame.serialize())
    assert_eq(restored, frame)
60 changes: 1 addition & 59 deletions python/cudf/cudf/tests/test_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import cudf
from cudf.tests import utils
from cudf.tests.utils import _decimal_series, assert_eq
from cudf.tests.utils import assert_eq


@pytest.mark.parametrize(
Expand Down Expand Up @@ -269,64 +269,6 @@ def test_serialize_string_check_buffer_sizes():
assert expect == got


@pytest.mark.parametrize(
    "data",
    [
        {"a": [[]]},
        {"a": [[1, 2, None, 4]]},
        {"a": [["cat", None, "dog"]]},
        {
            "a": [[1, 2, 3, None], [4, None, 5]],
            "b": [None, ["fish", "bird"]],
            "c": [[], []],
        },
        {"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]},
    ],
)
def test_serialize_list_columns(data):
    # Serialize then deserialize a list-typed DataFrame and compare.
    source = cudf.DataFrame(data)
    rebuilt = type(source).deserialize(*source.serialize())
    assert_eq(rebuilt, source)


@pytest.mark.parametrize(
    "data",
    [
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            )
        },
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            ),
            "b": _decimal_series(
                ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)
            ),
            "c": _decimal_series(
                ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
            ),
        },
        {
            "a": _decimal_series(
                ["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)
            ),
            "b": _decimal_series(
                ["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1)
            ),
            "c": _decimal_series(
                [None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
            ),
        },
    ],
)
def test_serialize_decimal_columns(data):
    # Serialize then deserialize a decimal DataFrame and compare.
    source = cudf.DataFrame(data)
    rebuilt = type(source).deserialize(*source.serialize())
    assert_eq(rebuilt, source)


def test_deserialize_cudf_0_16(datadir):
fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_0.16.pkl"

Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
Expand Down Expand Up @@ -53,3 +54,24 @@ def test_series_construction_with_nulls(input_obj):
got = cudf.Series(input_obj).to_arrow()

assert expect == got


@pytest.mark.parametrize(
    "fields",
    [
        {"a": np.dtype(np.int64)},
        {"a": np.dtype(np.int64), "b": None},
        {
            "a": cudf.ListDtype(np.dtype(np.int64)),
            "b": cudf.Decimal64Dtype(1, 0),
        },
        {
            "a": cudf.ListDtype(cudf.StructDtype({"b": np.dtype(np.int64)})),
            "b": cudf.ListDtype(cudf.ListDtype(np.dtype(np.int64))),
        },
    ],
)
def test_serialize_struct_dtype(fields):
    # Round-trip a StructDtype through serialize/deserialize.
    original = cudf.StructDtype(fields)
    roundtripped = type(original).deserialize(*original.serialize())
    assert roundtripped == original