rapidsai · rapids-bot · Sep 8, 2021 · Aug 26, 2021 · Aug 26, 2021 · Aug 27, 2021
@@ -16,7 +16,7 @@ dependencies:
   - python>=3.7,<3.9
   - numba>=0.53.1
   - numpy
-  - pandas>=1.0,<1.3.0dev0
+  - pandas>=1.0,<1.4.0dev0
   - pyarrow=5.0.0=*cuda
   - fastavro>=0.22.9
   - notebook>=0.5.0

@@ -16,7 +16,7 @@ dependencies:
   - python>=3.7,<3.9
   - numba>=0.53.1
   - numpy
-  - pandas>=1.0,<1.3.0dev0
+  - pandas>=1.0,<1.4.0dev0
   - pyarrow=5.0.0=*cuda
   - fastavro>=0.22.9
   - notebook>=0.5.0

@@ -517,7 +517,7 @@ def difference(self, other, sort=None):
             if self.dtype != other.dtype:
                 difference = difference.astype(self.dtype)
 
-        if sort is None:
+        if sort is None and len(other):
             return difference.sort_values()
 
         return difference

@@ -8,3 +8,5 @@
 PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
 PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
 PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2")
+PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0")
+PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
@@ -261,7 +261,7 @@ def as_timedelta_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.TimeDeltaColumn":
         raise TypeError(
-            f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]"
+            f"cannot astype a datetimelike from {self.dtype} to {dtype}"
         )
 
     def as_numerical_column(

@@ -343,7 +343,7 @@ def as_datetime_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.DatetimeColumn":
         raise TypeError(
-            f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]"
+            f"cannot astype a timedelta from {self.dtype} to {dtype}"
         )
 
     def as_string_column(

@@ -6561,8 +6561,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         ]
 
         if len(mode_results) == 0:
-            df = DataFrame(index=self.index)
-            return df
+            return DataFrame()
 
         df = cudf.concat(mode_results, axis=1)
         if isinstance(df, Series):

@@ -112,15 +112,15 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype":
         )
 
     def to_pandas(self) -> pd.CategoricalDtype:
-        if self.categories is None:
+        if self._categories is None:
             categories = None
         else:
             if isinstance(
-                self.categories, (cudf.Float32Index, cudf.Float64Index)
+                self._categories, (cudf.Float32Index, cudf.Float64Index)
             ):
-                categories = self.categories.dropna().to_pandas()
+                categories = self._categories.dropna().to_pandas()
             else:
-                categories = self.categories.to_pandas()
+                categories = self._categories.to_pandas()
         return pd.CategoricalDtype(categories=categories, ordered=self.ordered)
 
     def _init_categories(self, categories: Any):

@@ -1,4 +1,5 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+
 import collections
 import pickle
 import warnings
@@ -501,7 +502,7 @@ def mult(df):
         chunk_results = [function(chk) for chk in chunks]
 
         if not len(chunk_results):
-            return self.obj.__class__()
+            return self.obj.head(0)
 
         if cudf.utils.dtypes.is_scalar(chunk_results[0]):
             result = cudf.Series(chunk_results, index=group_names)
@@ -630,7 +631,7 @@ def rolling_avg(val, avg):
         .. code-block:: python
 
             Results:
-                 cat  val                 avg
+               cat  val                 avg
             0    1   16
             1    1   45
             2    1   62                41.0
@@ -713,8 +714,8 @@ def describe(self, include=None, exclude=None):
         2   24.0     90
         3   26.0     80
         >>> gdf.groupby('Score').describe()
-            Speed
-            count   mean   std    min    25%    50%    75%     max
+             Speed
+             count   mean   std    min    25%    50%    75%     max
         Score
         30        1  370.0  <NA>  370.0  370.0  370.0  370.0  370.0
         50        1  380.0  <NA>  380.0  380.0  380.0  380.0  380.0
@@ -946,13 +947,13 @@ def fillna(
                 >>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]})
                 >>> gdf = cudf.from_pandas(df)
                 >>> df.groupby('k').fillna({'v': 4}) # pandas
-                        v
+                       v
                 k
                 1 0  2.0
-                    1  4.0
+                  1  4.0
                 2 2  4.0
                 >>> gdf.groupby('k').fillna({'v': 4}) # cudf
-                        v
+                     v
                 0  2.0
                 1  4.0
                 2  4.0
@@ -1127,9 +1128,9 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
                     Max Speed
     Animal Type
     Falcon Captive      390.0
-        Wild         350.0
+           Wild         350.0
     Parrot Captive       30.0
-        Wild          20.0
+           Wild          20.0
     >>> df.groupby(level=0).mean()
             Max Speed
     Animal

@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+
 from __future__ import annotations
 
 import itertools
@@ -1422,7 +1423,7 @@ def to_pandas(self, nullable=False, **kwargs):
         if hasattr(self, "_source_data"):
             result = self._source_data.to_pandas(nullable=nullable)
             result.columns = self.names
-            return pd.MultiIndex.from_frame(result)
+            return pd.MultiIndex.from_frame(result, names=self.names)
 
         pandas_codes = []
         for code in self.codes.columns:

@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION
+# Copyright (c) 2020-2021, NVIDIA CORPORATION
 
 import itertools
 
@@ -393,7 +393,9 @@ def __init__(self, groupby, window, min_periods=None, center=False):
         # of `groupby.grouping.keys` and `groupby.obj`.
         # As an optimization, avoid gathering those twice.
         self._group_keys = groupby.grouping.keys.take(sort_order)
-        obj = groupby.obj.take(sort_order)
+        obj = groupby.obj.drop(
+            columns=groupby.grouping._key_column_names_from_obj
+        ).take(sort_order)
 
         gb_size = groupby.size().sort_index()
         self._group_starts = (

@@ -396,7 +396,18 @@ def test_categorical_as_unordered(pd_str_cat, inplace):
 
 @pytest.mark.parametrize("from_ordered", [True, False])
 @pytest.mark.parametrize("to_ordered", [True, False])
-@pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize(
+    "inplace",
+    [
+        pytest.param(
+            True,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/43232"
+            ),
+        ),
+        False,
+    ],
+)
 def test_categorical_reorder_categories(
     pd_str_cat, from_ordered, to_ordered, inplace
 ):
@@ -420,7 +431,18 @@ def test_categorical_reorder_categories(
     assert str(cd_sr_1) == str(pd_sr_1)
 
 
-@pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize(
+    "inplace",
+    [
+        pytest.param(
+            True,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/43232"
+            ),
+        ),
+        False,
+    ],
+)
 def test_categorical_add_categories(pd_str_cat, inplace):
 
     pd_sr = pd.Series(pd_str_cat.copy())
@@ -441,7 +463,18 @@ def test_categorical_add_categories(pd_str_cat, inplace):
     assert_eq(pd_sr_1, cd_sr_1)
 
 
-@pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize(
+    "inplace",
+    [
+        pytest.param(
+            True,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/43232"
+            ),
+        ),
+        False,
+    ],
+)
 def test_categorical_remove_categories(pd_str_cat, inplace):
 
     pd_sr = pd.Series(pd_str_cat.copy())

@@ -525,9 +525,22 @@ def test_concat_empty_dataframes(df, other, ignore_index):
     if expected.shape != df.shape:
         for key, col in actual[actual.columns].iteritems():
             if is_categorical_dtype(col.dtype):
-                expected[key] = expected[key].fillna("-1")
+                if expected[key].dtype != "category":
+                    # TODO: Pandas bug:
+                    # https://github.com/pandas-dev/pandas/issues/42840
+                    expected[key] = expected[key].fillna("-1").astype("str")
+                else:
+                    expected[key] = (
+                        expected[key]
+                        .cat.add_categories(["-1"])
+                        .fillna("-1")
+                        .astype("str")
+                    )
                 actual[key] = col.astype("str").fillna("-1")
-        assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False)
+            else:
+                expected[key] = expected[key].fillna(-1)
+                actual[key] = col.fillna(-1)
+        assert_eq(expected, actual, check_dtype=False)
     else:
         assert_eq(
             expected, actual, check_index_type=False if gdf.empty else True
@@ -1079,8 +1092,23 @@ def test_concat_join_empty_dataframes(
         if axis == 0:
             for key, col in actual[actual.columns].iteritems():
                 if is_categorical_dtype(col.dtype):
-                    expected[key] = expected[key].fillna("-1")
+                    if expected[key].dtype != "category":
+                        # TODO: Pandas bug:
+                        # https://github.com/pandas-dev/pandas/issues/42840
+                        expected[key] = (
+                            expected[key].fillna("-1").astype("str")
+                        )
+                    else:
+                        expected[key] = (
+                            expected[key]
+                            .cat.add_categories(["-1"])
+                            .fillna("-1")
+                            .astype("str")
+                        )
                     actual[key] = col.astype("str").fillna("-1")
+                else:
+                    expected[key] = expected[key].fillna(-1)
+                    actual[key] = col.fillna(-1)
 
             assert_eq(
                 expected.fillna(-1),
@@ -1100,7 +1128,11 @@ def test_concat_join_empty_dataframes(
                 check_column_type=False,
             )
     assert_eq(
-        expected, actual, check_index_type=False, check_column_type=False
+        expected,
+        actual,
+        check_dtype=False,
+        check_index_type=False,
+        check_column_type=False,
     )
 
 

@@ -1762,18 +1762,7 @@ def test_csv_write_empty_column_name(df, index, columns):
         cudf.DataFrame(index=cudf.Index([], name="index name")),
     ],
 )
-@pytest.mark.parametrize(
-    "index",
-    [
-        True,
-        pytest.param(
-            False,
-            marks=pytest.mark.xfail(
-                reason="https://github.com/rapidsai/cudf/issues/6691"
-            ),
-        ),
-    ],
-)
+@pytest.mark.parametrize("index", [True, False])
 def test_csv_write_empty_dataframe(df, index):
     pdf = df.to_pandas()