[REVIEW] Upgrade pandas to 1.2 #7375

Merged 46 commits on Feb 26, 2021
Changes from 43 commits

Commits (46)
155f10f
fix issues with updating to latest pandas
galipremsagar Feb 12, 2021
0ec247e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 12, 2021
454ecf5
remove xfails and fix issues
galipremsagar Feb 12, 2021
a1a928d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 18, 2021
303c77d
fix isin and misc tests
galipremsagar Feb 22, 2021
18d1fb3
remove redundant code
galipremsagar Feb 22, 2021
b727253
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 22, 2021
01afece
fix more issues
galipremsagar Feb 22, 2021
691d154
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
c7c47b5
fix lots of deprecated warnings
galipremsagar Feb 23, 2021
d106b79
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
aea3313
fix multiple warnings
galipremsagar Feb 23, 2021
9fdbfe7
unpin pandas
galipremsagar Feb 23, 2021
27a782b
cleanup
galipremsagar Feb 23, 2021
3cde2ef
cleanup
galipremsagar Feb 23, 2021
9a3b51a
copyright
galipremsagar Feb 23, 2021
2f8fe18
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
7a534b0
pin pandas upper bound version
galipremsagar Feb 24, 2021
81d9b5d
use only minor version
galipremsagar Feb 24, 2021
14e8c0e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 24, 2021
c5b83a2
use functools for finding union
galipremsagar Feb 24, 2021
5e6855d
add utility for creating a pandas series and refactor imports in test…
galipremsagar Feb 24, 2021
ea61733
remove is_scalar check
galipremsagar Feb 24, 2021
d8ca966
version all pytest xfails
galipremsagar Feb 24, 2021
8d079f0
add check_order flag
galipremsagar Feb 24, 2021
d8ff534
remove version for cudf apis
galipremsagar Feb 24, 2021
a0637b9
make importing cudf uniform in pytests
galipremsagar Feb 24, 2021
b63ae03
refactor imports to be uniform and less confusing
galipremsagar Feb 24, 2021
c3c3e68
remove versioning of cudf api call
galipremsagar Feb 24, 2021
992b483
Update python/cudf/cudf/tests/test_setitem.py
galipremsagar Feb 24, 2021
355e192
remove double validation
galipremsagar Feb 24, 2021
3942cf1
Merge branch '7367' of https://github.com/galipremsagar/cudf into 7367
galipremsagar Feb 24, 2021
8d06667
move datetime / duration isin logic to a common utility
galipremsagar Feb 24, 2021
032378d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 25, 2021
dd842f3
add atol
galipremsagar Feb 25, 2021
9fe44cd
rename internal api
galipremsagar Feb 25, 2021
da1a3a3
fix categorical setitem and allow np.nan into categories
galipremsagar Feb 26, 2021
e70686f
add nan setitem test
galipremsagar Feb 26, 2021
39ba07a
make null checks and to_pandas code flow more effecient
galipremsagar Feb 26, 2021
2cc496d
fix repr
galipremsagar Feb 26, 2021
0bd3bba
fix typo
galipremsagar Feb 26, 2021
3d44f5f
fix typo
galipremsagar Feb 26, 2021
c1c2d96
update index code
galipremsagar Feb 26, 2021
19ae2f6
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 26, 2021
ae1b8c6
add packaging conda install
galipremsagar Feb 26, 2021
416bc92
Merge branch 'branch-0.19' into 7367
galipremsagar Feb 26, 2021
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.1.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49.0,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.2.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
4 changes: 2 additions & 2 deletions conda/recipes/cudf/meta.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -35,7 +35,7 @@ requirements:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.2.0dev0
- pandas >=1.0,<1.3.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
- numpy
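Note: all of the environment and recipe files above move the pandas upper bound from `<1.2.0dev0` to `<1.3.0dev0`, so pandas 1.2.x is accepted while pre-releases of 1.3 stay excluded. A quick hedged sanity check of what that range admits, sketched with Python's `packaging` library (conda uses its own version matcher, but it should agree for these plain release versions):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# The new constraint from the conda files above.
spec = SpecifierSet(">=1.0,<1.3.0dev0")

for candidate in ["1.1.5", "1.2.2", "1.3.0"]:
    # Expected: 1.1.5 -> True, 1.2.2 -> True, 1.3.0 -> False
    print(candidate, Version(candidate) in spec)
```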
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/_compat.py
@@ -1,8 +1,9 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import pandas as pd
from packaging import version

PANDAS_VERSION = version.parse(pd.__version__)
PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0")
PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
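
`PANDAS_GE_120` is the flag the test suite now uses to gate pandas-version-specific xfails (see the "version all pytest xfails" commit). A minimal sketch of the pattern; the test name and reason string are hypothetical, not taken from this PR:

```python
import pytest

from cudf.core._compat import PANDAS_GE_120


@pytest.mark.xfail(
    not PANDAS_GE_120,
    reason="hypothetical: behavior only matches pandas >= 1.2",
)
def test_pandas_version_gated_behavior():
    # Placeholder body; real tests compare cudf output against pandas.
    assert True
```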
84 changes: 80 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
@@ -9,6 +9,7 @@
Dict,
Mapping,
Optional,
Sequence,
Tuple,
Union,
cast,
@@ -867,6 +868,15 @@ def set_base_data(self, value):
else:
super().set_base_data(value)

def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
lhs = self
# We need to convert values to same type as self,
# hence passing dtype=self.dtype
rhs = cudf.core.column.as_column(values, dtype=self.dtype)
return lhs, rhs

def set_base_mask(self, value: Optional[Buffer]):
super().set_base_mask(value)
self._codes = None
@@ -936,6 +946,21 @@ def unary_operator(self, unaryop: str):
)

def __setitem__(self, key, value):
if cudf.utils.dtypes.is_scalar(
value
) and cudf._lib.scalar._is_null_host_scalar(value):
to_add_categories = 0
else:
to_add_categories = len(
cudf.Index(value).difference(self.categories)
)

if to_add_categories > 0:
raise ValueError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)

if cudf.utils.dtypes.is_scalar(value):
value = self._encode(value) if value is not None else value
else:
@@ -1046,11 +1071,24 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
def to_pandas(
self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> pd.Series:
signed_dtype = min_signed_type(len(self.categories))
codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array()
categories = self.categories.to_pandas()

if self.categories.dtype.kind == "f":
new_mask = bools_to_mask(self.notnull())
col = column.build_categorical_column(
categories=self.categories,
codes=column.as_column(self.codes, dtype=self.codes.dtype),
mask=new_mask,
ordered=self.dtype.ordered,
size=self.codes.size,
)
else:
col = self

signed_dtype = min_signed_type(len(col.categories))
codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
categories = col.categories.dropna(drop_nan=True).to_pandas()
data = pd.Categorical.from_codes(
codes, categories=categories, ordered=self.ordered
codes, categories=categories, ordered=col.ordered
)
return pd.Series(data, index=index)
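
With the change above, a categorical column whose categories are floats strips `np.nan` from the category list and masks the corresponding codes before converting, since pandas does not allow NaN inside `Categorical.categories`. A rough sketch of the intended round-trip, assuming NaN was kept as a value (not a null) on the cudf side; exact construction details may differ:

```python
import numpy as np

import cudf

# Keep NaN as a real float value so it can become a category.
gsr = cudf.Series([1.0, 2.0, np.nan], nan_as_null=False).astype("category")

# On conversion the NaN category is expected to surface as a pandas
# missing value, with only the non-NaN values (1.0, 2.0) as categories.
psr = gsr.to_pandas()
print(psr)
print(psr.cat.categories)
```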

@@ -1180,6 +1218,38 @@ def find_and_replace(
ordered=self.dtype.ordered,
)

def isnull(self) -> ColumnBase:
"""
Identify missing values in a CategoricalColumn.
"""
result = libcudf.unary.is_null(self)

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = libcudf.unary.is_nan(self.categories)
if categories.any():
code = self._encode(np.nan)
result = result | (self.codes == cudf.Scalar(code))

return result

def notnull(self) -> ColumnBase:
"""
Identify non-missing values in a CategoricalColumn.
"""
result = libcudf.unary.is_valid(self)

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of an underlying float column
categories = libcudf.unary.is_nan(self.categories)
if categories.any():
code = self._encode(np.nan)
result = result & (self.codes != cudf.Scalar(code))

return result

def fillna(
self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
) -> CategoricalColumn:
@@ -1204,6 +1274,12 @@ def fillna(
raise ValueError(err_msg) from err
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value, CategoricalColumn):
if self.dtype != fill_value.dtype:
raise ValueError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# TODO: only required if fill_value has a subset of the
# categories:
fill_value = fill_value.cat()._set_categories(
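The `__setitem__` and `fillna` changes above align cudf's categorical assignment rules with pandas: assigning a value that is not already a category raises, while assigning `None`/NaN (a null) is allowed without touching the categories. A hedged sketch of the behavior these checks aim for (the error message is quoted from the diff; everything else is illustrative):

```python
import cudf

gsr = cudf.Series(["a", "b", "a"], dtype="category")

# Assigning an existing category is fine.
gsr[0] = "b"

# Assigning a null does not require a new category.
gsr[2] = None

# Assigning a value outside the current categories should now raise:
# "Cannot setitem on a Categorical with a new category, set the categories first"
try:
    gsr[1] = "z"
except ValueError as err:
    print(err)
```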
95 changes: 54 additions & 41 deletions python/cudf/cudf/core/column/column.py
@@ -1,4 +1,5 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from __future__ import annotations

import builtins
@@ -49,12 +50,12 @@
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
is_numerical_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
is_interval_dtype,
min_signed_type,
min_unsigned_type,
np_to_pa_dtype,
@@ -848,55 +849,65 @@ def isin(self, values: Sequence) -> ColumnBase:
-------
result: Column
Column of booleans indicating if each element is in values.
Raises
-------
TypeError
If values is a string
"""
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a [{type(values).__name__}]"
)

lhs = self
rhs = None

try:
# We need to convert values to same type as self,
# hence passing dtype=self.dtype
rhs = as_column(values, dtype=self.dtype)

# Short-circuit if rhs is all null.
if lhs.null_count == 0 and (rhs.null_count == len(rhs)):
return full(len(self), False, dtype="bool")
lhs, rhs = self._process_values_for_isin(values)
res = lhs._isin_earlystop(rhs)
if res is not None:
return res
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return full(len(self), False, dtype="bool")

# If categorical, combine categories first
if is_categorical_dtype(lhs):
lhs_cats = lhs.cat().categories._values
rhs_cats = rhs.cat().categories._values

if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype):
# If they're not the same dtype, short-circuit if the values
# list doesn't have any nulls. If it does have nulls, make
# the values list a Categorical with a single null
if not rhs.has_nulls:
return full(len(self), False, dtype="bool")
rhs = as_column(pd.Categorical.from_codes([-1], categories=[]))
rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype)

ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))})
res = lhs._obtain_isin_result(rhs)

return res

def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
"""
Helper function for `isin` which pre-processes `values` based on `self`.
"""
lhs = self
rhs = as_column(values, nan_as_null=False)
if lhs.null_count == len(lhs):
lhs = lhs.astype(rhs.dtype)
elif rhs.null_count == len(rhs):
rhs = rhs.astype(lhs.dtype)
return lhs, rhs

def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
"""
Helper function for `isin` which determines possibility of
early-stopping or not.
"""
if self.dtype != rhs.dtype:
if self.null_count and rhs.null_count:
return self.isna()
else:
return cudf.core.column.full(len(self), False, dtype="bool")
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
return cudf.core.column.full(len(self), False, dtype="bool")
else:
return None

def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
"""
Helper function for `isin` which merges `self` & `rhs`
to determine which values of `self` exist in `rhs`.
"""
ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))})
rdf = cudf.DataFrame(
{"x": rhs, "bool": full(len(rhs), True, dtype="bool")}
)
res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order")
res = res.drop_duplicates(subset="orig_order", ignore_index=True)
res = res._data["bool"].fillna(False)

return res

def as_mask(self) -> Buffer:
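
The refactor splits `isin` into three hooks: `_process_values_for_isin` normalizes `values` against the column (and is overridden for categorical and datetime columns), `_isin_earlystop` short-circuits on dtype mismatches or all-null inputs, and `_obtain_isin_result` performs the order-preserving left merge. A small hedged sketch of the user-visible behavior, with expected results noted as comments:

```python
import cudf

gsr = cudf.Series([1, 2, 3, None])

# Same dtype: regular merge-based membership test.
# Expected: [True, False, True, False]
print(gsr.isin([1, 3]).to_pandas().tolist())

# Mismatched dtype and no nulls on either side: the early-stop path
# returns all False without performing the merge.
# Expected: [False, False, False]
print(gsr.dropna().isin(["a", "b"]).to_pandas().tolist())
```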
@@ -1052,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:

# columns include null index in factorization; remove:
if self.has_nulls:
cats = cats.dropna()
cats = cats._column.dropna(drop_nan=False)
min_type = min_unsigned_type(len(cats), 8)
labels = labels - 1
if np.dtype(min_type).itemsize < labels.dtype.itemsize:
labels = labels.astype(min_type)

return build_categorical_column(
categories=cats._column,
categories=cats,
codes=labels._column,
mask=self.mask,
ordered=ordered,
@@ -1250,7 +1261,7 @@ def sum(
def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform prod with type {self.dtype}")
raise TypeError(f"cannot perform product with type {self.dtype}")

def mean(self, skipna: bool = None, dtype: Dtype = None):
raise TypeError(f"cannot perform mean with type {self.dtype}")
@@ -1262,7 +1273,7 @@ def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform var with type {self.dtype}")

def kurtosis(self, skipna: bool = None):
raise TypeError(f"cannot perform kurt with type {self.dtype}")
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")

def skew(self, skipna: bool = None):
raise TypeError(f"cannot perform skew with type {self.dtype}")
@@ -2066,9 +2077,11 @@ def _construct_array(
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
except (TypeError, ValueError):
native_dtype = dtype
if dtype is None and pd.api.types.infer_dtype(arbitrary) in (
"mixed",
"mixed-integer",
if (
dtype is None
and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
and pd.api.types.infer_dtype(arbitrary)
in ("mixed", "mixed-integer",)
):
native_dtype = "object"
arbitrary = np.asarray(