Fix categorical scalar insertion (#6830)

This PR closes part of #2269 and replaces #5306, which became messy after being rebased from an earlier branch of `cudf`. Its purpose is to fix the pandas Categorical assignment shown below, which in turn fixes #2269.

```python
import numpy as np
import pandas as pd
import cudf

head = cudf.DataFrame({'a':[1]})
cat_s = pd.Categorical.from_codes(np.zeros(len(head), dtype=int), categories=['cat.csv'])
head.assign(path=cat_s)
```
rapidsai · Nov 24, 2020 · fd72e5f · fd72e5f
1 parent cdd72c9
commit fd72e5f
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -154,6 +154,7 @@
 - PR #6798 Fix `read_avro` docs
 - PR #6824 Fix JNI build
 - PR #6826 Fix resource management in Java ColumnBuilder
+- PR #6830 Fix categorical scalar insertion
 
 
 # cuDF 0.16.0 (21 Oct 2020)

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
@@ -713,3 +713,53 @@ def test_add_categories_mixed_error():
 
     with pytest.raises(TypeError):
         gds.cat.add_categories(["a", "bd", "ef"])
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 4],
+        ["a", "1", "2", "1", "a"],
+        pd.Series(["a", "1", "22", "1", "aa"]),
+        pd.Series(["a", "1", "22", "1", "aa"], dtype="category"),
+        pd.Series([1, 2, 3, 4], dtype="int64"),
+        pd.Series([1, 2.3, 3, 4], dtype="float"),
+        [None, 1, None, 2, None],
+        ["a"],
+    ],
+)
+@pytest.mark.parametrize(
+    "cat_dtype",
+    [
+        pd.CategoricalDtype(categories=["aa", "bb", "cc"]),
+        pd.CategoricalDtype(categories=[2, 4, 10, 100]),
+        pd.CategoricalDtype(categories=["aa", "bb", "c"]),
+        pd.CategoricalDtype(categories=["a", "bb", "c"]),
+        pd.CategoricalDtype(categories=["a", "b", "c"]),
+        pd.CategoricalDtype(categories=["22", "b", "c"]),
+        pd.CategoricalDtype(categories=["a"]),
+    ],
+)
+def test_categorical_assignment(data, cat_dtype):
+    """Compare `assign` of categorical data between pandas and cudf.
+
+    For every `data` / `cat_dtype` combination, a categorical `pd.Series`
+    and a raw `pd.Categorical` are assigned as a new column to a pandas
+    frame and to its cudf counterpart, and the resulting frames are
+    compared with `assert_eq`.
+    """
+    pd_df = pd.DataFrame()
+    pd_df["a"] = np.ones(len(data))
+    cd_df = gd.from_pandas(pd_df)
+
+    pd_cat_series = pd.Series(data, dtype=cat_dtype)
+    # assign categorical series
+    # `assign` returns a new frame instead of modifying its caller, so the
+    # returned frames are what must be compared; comparing `pd_df` and
+    # `cd_df` would never look at the newly added column.
+    expect = pd_df.assign(cat_col=pd_cat_series)
+    got = cd_df.assign(cat_col=pd_cat_series)
+    assert_eq(expect, got)
+
+    # assign categorical array
+    # needed for dask_cudf support for including file name
+    # as a categorical column
+    # see issue: https://github.com/rapidsai/cudf/issues/2269
+    pd_df = pd.DataFrame()
+    pd_df["a"] = np.ones(len(data))
+    cd_df = gd.from_pandas(pd_df)
+
+    pd_categorical = pd.Categorical(data, dtype=cat_dtype)
+    expect = pd_df.assign(cat_col=pd_categorical)
+    got = cd_df.assign(cat_col=pd_categorical)
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
@@ -89,7 +89,12 @@ def scalar_broadcast_to(scalar, size, dtype=None):
         return column.column_empty(size, dtype=dtype, masked=True)
 
     if isinstance(scalar, pd.Categorical):
-        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)
+        if dtype is None:
+            return _categorical_scalar_broadcast_to(scalar, size)
+        else:
+            return scalar_broadcast_to(scalar.categories[0], size).astype(
+                dtype
+            )
 
     scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
     dtype = scalar.dtype
@@ -532,3 +537,27 @@ def get_relevant_submodule(func, module):
         else:
             return None
     return module
+
+
+def _categorical_scalar_broadcast_to(cat_scalar, size):
+    """Build a categorical column from the first element of `cat_scalar`.
+
+    Parameters
+    ----------
+    cat_scalar : pd.Categorical, pd.Series, cudf.Series or CategoricalColumn
+        Categorical container whose first element supplies the code to
+        broadcast; its categories and `ordered` flag are carried over.
+    size : int
+        Passed to `scalar_broadcast_to` together with the first code;
+        presumably the number of rows of the result.
+
+    Returns
+    -------
+    The object returned by `column.build_categorical_column` for the
+    broadcast codes and the categories of `cat_scalar`.
+    """
+    if isinstance(cat_scalar, (cudf.Series, pd.Series)):
+        cats = cat_scalar.cat.categories
+        # `.cat.codes` is itself a Series carrying the original index, so
+        # plain `[0]` would be a label lookup; take the first row by position.
+        code = cat_scalar.cat.codes.iloc[0]
+        ordered = cat_scalar.cat.ordered
+    else:
+        # handles pd.Categorical, cudf.categorical.CategoricalColumn
+        cats = cat_scalar.categories
+        code = cat_scalar.codes[0]
+        ordered = cat_scalar.ordered
+
+    # NOTE(review): pandas reports code -1 for a missing value; how a null
+    # first element is represented after broadcasting is not visible from
+    # this function - confirm against `scalar_broadcast_to`.
+    cats = column.as_column(cats)
+    codes = scalar_broadcast_to(code, size)
+
+    return column.build_categorical_column(
+        categories=cats,
+        codes=codes,
+        mask=codes.base_mask,
+        size=codes.size,
+        offset=codes.offset,
+        ordered=ordered,
+    )