rapidsai · rapids-bot · Feb 26, 2021 · Feb 12, 2021 · Feb 12, 2021 · Feb 12, 2021
@@ -946,12 +946,14 @@ def unary_operator(self, unaryop: str):
         )
 
     def __setitem__(self, key, value):
-        to_add_categories = cudf.Index(value).difference(self.categories)
+        if cudf.utils.dtypes.is_scalar(
+            value
+        ) and cudf._lib.scalar._is_null_host_scalar(value):
+            to_add_categories = []
+        else:
+            to_add_categories = cudf.Index(value).difference(self.categories)
 
-        if (
-            len(to_add_categories)
-            and not to_add_categories.isna()._values.all()
-        ):
+        if len(to_add_categories):
             raise ValueError(
                 "Cannot setitem on a Categorical with a new "
                 "category, set the categories first"
@@ -1067,11 +1069,18 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
     def to_pandas(
         self, index: ColumnLike = None, nullable: bool = False, **kwargs
     ) -> pd.Series:
-        signed_dtype = min_signed_type(len(self.categories))
-        codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array()
-        categories = self.categories.to_pandas()
+
+        if self.categories.isnull().any():
+            col = self.copy(deep=True)
+            col[col.isnull()] = None
+        else:
+            col = self
+
+        signed_dtype = min_signed_type(len(col.categories))
+        codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
+        categories = col.categories.dropna(drop_nan=True).to_pandas()
         data = pd.Categorical.from_codes(
-            codes, categories=categories, ordered=self.ordered
+            codes, categories=categories, ordered=col.ordered
         )
         return pd.Series(data, index=index)
 
@@ -1201,6 +1210,20 @@ def find_and_replace(
             ordered=self.dtype.ordered,
         )
 
+    def isnull(self) -> ColumnBase:
+        """Identify nulls (and NaNs, for float categories) in a Column.
+        """
+        result = libcudf.unary.is_null(self)
+
+        if self.categories.dtype.kind == "f":
+            # Also treat `np.nan` values as missing when the
+            # categories are floating-point
+            result = result | libcudf.unary.is_nan(
+                self.astype(self.categories.dtype)
+            )
+
+        return result
+
     def fillna(
         self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
     ) -> CategoricalColumn:

@@ -1063,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
 
         # columns include null index in factorization; remove:
         if self.has_nulls:
-            cats = cats.dropna()
+            cats = cats._column.dropna(drop_nan=False)
             min_type = min_unsigned_type(len(cats), 8)
             labels = labels - 1
             if np.dtype(min_type).itemsize < labels.dtype.itemsize:
                 labels = labels.astype(min_type)
 
         return build_categorical_column(
-            categories=cats._column,
+            categories=cats,
             codes=labels._column,
             mask=self.mask,
             ordered=ordered,
@@ -2077,9 +2077,11 @@ def _construct_array(
         arbitrary = cupy.asarray(arbitrary, dtype=dtype)
     except (TypeError, ValueError):
         native_dtype = dtype
-        if dtype is None and pd.api.types.infer_dtype(arbitrary) in (
-            "mixed",
-            "mixed-integer",
+        if (
+            dtype is None
+            and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
+            and pd.api.types.infer_dtype(arbitrary)
+            in ("mixed", "mixed-integer",)
         ):
             native_dtype = "object"
         arbitrary = np.asarray(

@@ -1993,7 +1993,20 @@ def __repr__(self):
         # utilize `Index.to_string` once it is implemented
         # related issue : https://github.com/pandas-dev/pandas/issues/35389
         if isinstance(preprocess, CategoricalIndex):
-            output = preprocess.to_pandas().__repr__()
+            if preprocess.categories.dtype.kind == "f":
+                output = (
+                    preprocess.astype("str")
+                    .to_pandas()
+                    .astype("category")
+                    .__repr__()
+                )
+                break_idx = output.find("ordered=")
+                output = (
+                    output[:break_idx].replace("'", "") + output[break_idx:]
+                )
+            else:
+                output = preprocess.to_pandas().__repr__()
+
             output = output.replace("nan", cudf._NA_REP)
         elif preprocess._values.nullable:
             output = self._clean_nulls_from_index().to_pandas().__repr__()

@@ -95,8 +95,10 @@ def __setitem__(self, key, value):
         else:
             value = column.as_column(value)
 
-        if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype(
-            value.dtype
+        if (
+            not is_categorical_dtype(self._sr._column.dtype)
+            and hasattr(value, "dtype")
+            and pd.api.types.is_numeric_dtype(value.dtype)
         ):
             # normalize types if necessary:
             if not pd.api.types.is_integer(key):

@@ -1070,7 +1070,13 @@ def __repr__(self):
                 else get_option("display.min_rows")
             )
             show_dimensions = get_option("display.show_dimensions")
-            output = preprocess.to_pandas().to_string(
+            if preprocess._column.categories.dtype.kind == "f":
+                pd_series = (
+                    preprocess.astype("str").to_pandas().astype("category")
+                )
+            else:
+                pd_series = preprocess.to_pandas()
+            output = pd_series.to_string(
                 name=self.name,
                 dtype=self.dtype,
                 min_rows=min_rows,
@@ -1085,6 +1091,8 @@ def __repr__(self):
 
         if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
             category_memory = lines[-1]
+            if preprocess._column.categories.dtype.kind == "f":
+                category_memory = category_memory.replace("'", "")
             lines = lines[:-1]
         if len(lines) > 1:
             if lines[-1].startswith("Name: "):

@@ -762,3 +762,31 @@ def test_categorical_assignment(data, cat_dtype):
     pd_df.assign(cat_col=pd_categorical)
     cd_df.assign(cat_col=pd_categorical)
     assert_eq(pd_df, cd_df)
+
+
+def test_categorical_allow_nan():
+    gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False)
+    gs = gs.astype("category")
+    expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8")
+    assert_eq(expected_codes, gs.cat.codes)
+
+    expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64")
+    assert_eq(expected_categories, gs.cat.categories)
+
+    actual_ps = gs.to_pandas()
+    expected_ps = pd.Series(
+        [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category"
+    )
+    assert_eq(actual_ps, expected_ps)
+
+
+def test_categorical_setitem_with_nan():
+    gs = cudf.Series(
+        [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
+    ).astype("category")
+    gs[[1, 3]] = np.nan
+
+    expected_series = cudf.Series(
+        [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False
+    ).astype(gs.dtype)
+    assert_eq(gs, expected_series)
@@ -1417,3 +1417,59 @@ def test_mulitIndex_null_repr(gdi, expected_repr):
     actual_repr = gdi.__repr__()
 
     assert actual_repr.split() == expected_repr.split()
+
+
+def test_categorical_series_with_nan_repr():
+    series = cudf.Series(
+        [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
+    ).astype("category")
+
+    expected_repr = textwrap.dedent(
+        """
+    0     1.0
+    1     2.0
+    2     NaN
+    3    10.0
+    4     NaN
+    5    <NA>
+    dtype: category
+    Categories (4, object): [1.0, 10.0, 2.0, NaN]
+    """
+    )
+
+    assert series.__repr__().split() == expected_repr.split()
+
+
+def test_categorical_dataframe_with_nan_repr():
+    series = cudf.Series(
+        [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
+    ).astype("category")
+    df = cudf.DataFrame({"a": series})
+    expected_repr = textwrap.dedent(
+        """
+          a
+    0   1.0
+    1   2.0
+    2   NaN
+    3  10.0
+    4   NaN
+    5  <NA>
+    """
+    )
+
+    assert df.__repr__().split() == expected_repr.split()
+
+
+def test_categorical_index_with_nan_repr():
+    cat_index = cudf.Index(
+        cudf.Series(
+            [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
+        ).astype("category")
+    )
+
+    expected_repr = (
+        "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
+        "categories=[1.0, 10.0, 2.0, NaN], ordered=False, dtype='category')"
+    )
+
+    assert cat_index.__repr__() == expected_repr