From fd72e5fed1a146f2d67db4514e5cb7b7f85a6ba3 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 23 Nov 2020 22:28:42 -0800 Subject: [PATCH] Fix categorical scalar insertion (#6830) This PR closes part of #2269 and is created in place of https://github.com/rapidsai/cudf/pull/5306. I created this PR because that PR (https://github.com/rapidsai/cudf/pull/5306) got messy after rebasing from an earlier branch of `cudf`. The purpose of this PR is to fix the pandas Categorical assignment shown below, which in turn fixes #2269. ```python import numpy as np import pandas as pd import cudf head = cudf.DataFrame({'a':[1]}) cat_s = pd.Categorical.from_codes(np.zeros(len(head), dtype=int), categories=['cat.csv']) head.assign(path=cat_s) ``` --- CHANGELOG.md | 1 + python/cudf/cudf/tests/test_categorical.py | 50 ++++++++++++++++++++++ python/cudf/cudf/utils/utils.py | 31 +++++++++++++- 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f811964c75b..25bbb34fbe1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -154,6 +154,7 @@ - PR #6798 Fix `read_avro` docs - PR #6824 Fix JNI build - PR #6826 Fix resource management in Java ColumnBuilder +- PR #6830 Fix categorical scalar insertion # cuDF 0.16.0 (21 Oct 2020) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 1b54fa67fba..2d8130e6cb1 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -713,3 +713,53 @@ def test_add_categories_mixed_error(): with pytest.raises(TypeError): gds.cat.add_categories(["a", "bd", "ef"]) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + ["a", "1", "2", "1", "a"], + pd.Series(["a", "1", "22", "1", "aa"]), + pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), + pd.Series([1, 2, 3, 4], dtype="int64"), + pd.Series([1, 2.3, 3, 4], dtype="float"), + [None, 1, None, 2, None], + ["a"], + ], +) +@pytest.mark.parametrize( + "cat_dtype", + [ + 
pd.CategoricalDtype(categories=["aa", "bb", "cc"]), + pd.CategoricalDtype(categories=[2, 4, 10, 100]), + pd.CategoricalDtype(categories=["aa", "bb", "c"]), + pd.CategoricalDtype(categories=["a", "bb", "c"]), + pd.CategoricalDtype(categories=["a", "b", "c"]), + pd.CategoricalDtype(categories=["22", "b", "c"]), + pd.CategoricalDtype(categories=["a"]), + ], +) +def test_categorical_assignment(data, cat_dtype): + pd_df = pd.DataFrame() + pd_df["a"] = np.ones(len(data)) + cd_df = gd.from_pandas(pd_df) + + pd_cat_series = pd.Series(data, dtype=cat_dtype) + # assign categorical series + pd_df.assign(cat_col=pd_cat_series) + cd_df.assign(cat_col=pd_cat_series) + assert_eq(pd_df, cd_df) + + # assign categorical array + # needed for dask_cudf support for including file name + # as a categorical column + # see issue: https://github.com/rapidsai/cudf/issues/2269 + pd_df = pd.DataFrame() + pd_df["a"] = np.ones(len(data)) + cd_df = gd.from_pandas(pd_df) + + pd_categorical = pd.Categorical(data, dtype=cat_dtype) + pd_df.assign(cat_col=pd_categorical) + cd_df.assign(cat_col=pd_categorical) + assert_eq(pd_df, cd_df) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4ab1c833ebc..c4592dfa100 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -89,7 +89,12 @@ def scalar_broadcast_to(scalar, size, dtype=None): return column.column_empty(size, dtype=dtype, masked=True) if isinstance(scalar, pd.Categorical): - return scalar_broadcast_to(scalar.categories[0], size).astype(dtype) + if dtype is None: + return _categorical_scalar_broadcast_to(scalar, size) + else: + return scalar_broadcast_to(scalar.categories[0], size).astype( + dtype + ) scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype @@ -532,3 +537,27 @@ def get_relevant_submodule(func, module): else: return None return module + + +def _categorical_scalar_broadcast_to(cat_scalar, size): + if isinstance(cat_scalar, (cudf.Series, pd.Series)): + 
cats = cat_scalar.cat.categories + code = cat_scalar.cat.codes[0] + ordered = cat_scalar.cat.ordered + else: + # handles pd.Categorical, cudf.categorical.CategoricalColumn + cats = cat_scalar.categories + code = cat_scalar.codes[0] + ordered = cat_scalar.ordered + + cats = column.as_column(cats) + codes = scalar_broadcast_to(code, size) + + return column.build_categorical_column( + categories=cats, + codes=codes, + mask=codes.base_mask, + size=codes.size, + offset=codes.offset, + ordered=ordered, + )