Skip to content

Commit

Permalink
Fix categorical scalar insertion (#6830)
Browse files Browse the repository at this point in the history
This PR closes part of #2269 and replaces #5306.

I opened this PR because #5306 became messy after it was rebased from an earlier branch of `cudf`.

The purpose of this PR is to fix the pandas Categorical assignment shown below, which in turn fixes #2269.

```python

import numpy as np
import pandas as pd
import cudf

head = cudf.DataFrame({'a':[1]})
cat_s = pd.Categorical.from_codes(np.zeros(len(head), dtype=int), categories=['cat.csv'])
head.assign(path=cat_s)
```
  • Loading branch information
VibhuJawa authored Nov 24, 2020
1 parent cdd72c9 commit fd72e5f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@
- PR #6798 Fix `read_avro` docs
- PR #6824 Fix JNI build
- PR #6826 Fix resource management in Java ColumnBuilder
- PR #6830 Fix categorical scalar insertion


# cuDF 0.16.0 (21 Oct 2020)
Expand Down
50 changes: 50 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,3 +713,53 @@ def test_add_categories_mixed_error():

with pytest.raises(TypeError):
gds.cat.add_categories(["a", "bd", "ef"])


@pytest.mark.parametrize(
    "data",
    [
        [1, 2, 3, 4],
        ["a", "1", "2", "1", "a"],
        pd.Series(["a", "1", "22", "1", "aa"]),
        pd.Series(["a", "1", "22", "1", "aa"], dtype="category"),
        pd.Series([1, 2, 3, 4], dtype="int64"),
        pd.Series([1, 2.3, 3, 4], dtype="float"),
        [None, 1, None, 2, None],
        ["a"],
    ],
)
@pytest.mark.parametrize(
    "cat_dtype",
    [
        pd.CategoricalDtype(categories=["aa", "bb", "cc"]),
        pd.CategoricalDtype(categories=[2, 4, 10, 100]),
        pd.CategoricalDtype(categories=["aa", "bb", "c"]),
        pd.CategoricalDtype(categories=["a", "bb", "c"]),
        pd.CategoricalDtype(categories=["a", "b", "c"]),
        pd.CategoricalDtype(categories=["22", "b", "c"]),
        pd.CategoricalDtype(categories=["a"]),
    ],
)
def test_categorical_assignment(data, cat_dtype):
    """Check that assigning categorical data matches pandas.

    Covers both a categorical ``pd.Series`` and a raw ``pd.Categorical``
    passed to ``DataFrame.assign`` on a pandas frame and its cudf copy.
    """
    pd_df = pd.DataFrame()
    pd_df["a"] = np.ones(len(data))
    cd_df = gd.from_pandas(pd_df)

    pd_cat_series = pd.Series(data, dtype=cat_dtype)
    # assign categorical series
    # ``assign`` returns a new frame instead of modifying in place, so the
    # returned frames are what must be compared; comparing ``pd_df`` and
    # ``cd_df`` would not exercise the assignment at all.
    expect = pd_df.assign(cat_col=pd_cat_series)
    got = cd_df.assign(cat_col=pd_cat_series)
    assert_eq(expect, got)

    # assign categorical array
    # needed for dask_cudf support for including file name
    # as a categorical column
    # see issue: https://github.com/rapidsai/cudf/issues/2269
    pd_df = pd.DataFrame()
    pd_df["a"] = np.ones(len(data))
    cd_df = gd.from_pandas(pd_df)

    pd_categorical = pd.Categorical(data, dtype=cat_dtype)
    expect = pd_df.assign(cat_col=pd_categorical)
    got = cd_df.assign(cat_col=pd_categorical)
    assert_eq(expect, got)
31 changes: 30 additions & 1 deletion python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,12 @@ def scalar_broadcast_to(scalar, size, dtype=None):
return column.column_empty(size, dtype=dtype, masked=True)

if isinstance(scalar, pd.Categorical):
return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)
if dtype is None:
return _categorical_scalar_broadcast_to(scalar, size)
else:
return scalar_broadcast_to(scalar.categories[0], size).astype(
dtype
)

scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
dtype = scalar.dtype
Expand Down Expand Up @@ -532,3 +537,27 @@ def get_relevant_submodule(func, module):
else:
return None
return module


def _categorical_scalar_broadcast_to(cat_scalar, size):
    """Broadcast the first element of a categorical to ``size`` rows.

    Parameters
    ----------
    cat_scalar : cudf.Series, pd.Series, or categorical array-like
        Categorical container; only its first element's code is used.
        Anything that is not a Series must expose ``categories``,
        ``codes`` and ``ordered`` directly (e.g. ``pd.Categorical``).
    size : int
        Number of rows in the resulting column.

    Returns
    -------
    The column produced by ``column.build_categorical_column``, carrying
    the input's categories and ``ordered`` flag, with every code equal to
    the code of the input's first element.
    """
    if isinstance(cat_scalar, (cudf.Series, pd.Series)):
        cats = cat_scalar.cat.categories
        # ``.cat.codes`` is itself a Series that keeps the parent's index,
        # so ``codes[0]`` would be a label lookup and fail (or pick the
        # wrong row) when the index is not 0-based. Use positional access.
        code = cat_scalar.cat.codes.iloc[0]
        ordered = cat_scalar.cat.ordered
    else:
        # handles pd.Categorical, cudf.categorical.CategoricalColumn
        cats = cat_scalar.categories
        code = cat_scalar.codes[0]
        ordered = cat_scalar.ordered

    cats = column.as_column(cats)
    codes = scalar_broadcast_to(code, size)

    return column.build_categorical_column(
        categories=cats,
        codes=codes,
        mask=codes.base_mask,
        size=codes.size,
        offset=codes.offset,
        ordered=ordered,
    )

0 comments on commit fd72e5f

Please sign in to comment.