Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Upgrade pandas version in cudf #9147

Merged
merged 42 commits into branch-21.10 from 9125
Sep 8, 2021
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
866d112
fix test
galipremsagar Aug 26, 2021
e3bd5df
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Aug 26, 2021
7bde9b4
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Aug 27, 2021
b8be491
fix initial pass of issues related to pandas 1.3 upgrade.
galipremsagar Aug 30, 2021
3209afc
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Aug 30, 2021
2890bac
fix failures.
galipremsagar Aug 31, 2021
45a73e7
merge
galipremsagar Aug 31, 2021
3c720bd
more fixes
galipremsagar Aug 31, 2021
626685b
remove pandas bug workarounds
galipremsagar Aug 31, 2021
d1db177
more cleanup
galipremsagar Aug 31, 2021
94d46aa
more fixes
galipremsagar Aug 31, 2021
3c1282b
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Aug 31, 2021
3385542
match pandas behavior
galipremsagar Sep 1, 2021
f69a997
add conditional xfail
galipremsagar Sep 1, 2021
f843245
temp commit
galipremsagar Sep 2, 2021
60e4f0e
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Sep 2, 2021
7362ba2
tmp
galipremsagar Sep 2, 2021
49a1fcc
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Sep 2, 2021
a58f88e
tmp
galipremsagar Sep 2, 2021
2793df4
tmp
galipremsagar Sep 2, 2021
bdb31ae
Update ci/gpu/build.sh
galipremsagar Sep 2, 2021
fa85131
tmp
galipremsagar Sep 3, 2021
f8cd6c4
tmp
galipremsagar Sep 3, 2021
ac1cf22
tmp
galipremsagar Sep 3, 2021
061fa6c
Update ci/gpu/build.sh
galipremsagar Sep 3, 2021
cde3ba7
Update ci/gpu/build.sh
galipremsagar Sep 3, 2021
82465f1
Update ci/gpu/build.sh
galipremsagar Sep 3, 2021
21c110c
Update ci/gpu/build.sh
galipremsagar Sep 3, 2021
16455a3
Update build.sh
galipremsagar Sep 3, 2021
35b1c5d
tmp
galipremsagar Sep 3, 2021
e0cfcbf
Update build.sh
galipremsagar Sep 3, 2021
078d9d9
Update build.sh
galipremsagar Sep 4, 2021
d85fe5b
Merge remote-tracking branch 'upstream/branch-21.10' into 9125
galipremsagar Sep 4, 2021
b1a7cf0
remove unnecessary code
galipremsagar Sep 4, 2021
0aef021
misc doc fixes
galipremsagar Sep 4, 2021
11bd27a
copyright
galipremsagar Sep 4, 2021
bd1c35e
Update build.sh
galipremsagar Sep 7, 2021
c0435e1
Update build.sh
galipremsagar Sep 7, 2021
cfac5b9
address reviews
galipremsagar Sep 7, 2021
1f7c111
add comment
galipremsagar Sep 7, 2021
1ded3a3
make `kind` an optional param.
galipremsagar Sep 7, 2021
3a099c3
Merge branch 'rapidsai:branch-21.10' into 9125
galipremsagar Sep 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.53.1
- numpy
- pandas>=1.0,<1.3.0dev0
- pandas>=1.0,<1.4.0dev0
- pyarrow=5.0.0=*cuda
- fastavro>=0.22.9
- notebook>=0.5.0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.53.1
- numpy
- pandas>=1.0,<1.3.0dev0
- pandas>=1.0,<1.4.0dev0
- pyarrow=5.0.0=*cuda
- fastavro>=0.22.9
- notebook>=0.5.0
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def difference(self, other, sort=None):
if self.dtype != other.dtype:
difference = difference.astype(self.dtype)

if sort is None:
if sort is None and len(other):
return difference.sort_values()

return difference
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2")
PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0")
PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def as_timedelta_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.TimeDeltaColumn":
raise TypeError(
f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]"
f"cannot astype a datetimelike from {self.dtype} to {dtype}"
)

def as_numerical_column(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def as_datetime_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DatetimeColumn":
raise TypeError(
f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]"
f"cannot astype a timedelta from {self.dtype} to {dtype}"
)

def as_string_column(
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6561,8 +6561,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
]

if len(mode_results) == 0:
df = DataFrame(index=self.index)
return df
return DataFrame()

df = cudf.concat(mode_results, axis=1)
if isinstance(df, Series):
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,15 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype":
)

def to_pandas(self) -> pd.CategoricalDtype:
if self.categories is None:
if self._categories is None:
categories = None
else:
if isinstance(
self.categories, (cudf.Float32Index, cudf.Float64Index)
self._categories, (cudf.Float32Index, cudf.Float64Index)
):
categories = self.categories.dropna().to_pandas()
categories = self._categories.dropna().to_pandas()
else:
categories = self.categories.to_pandas()
categories = self._categories.to_pandas()
return pd.CategoricalDtype(categories=categories, ordered=self.ordered)

def _init_categories(self, categories: Any):
Expand Down
21 changes: 11 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import collections
import pickle
import warnings
Expand Down Expand Up @@ -501,7 +502,7 @@ def mult(df):
chunk_results = [function(chk) for chk in chunks]

if not len(chunk_results):
return self.obj.__class__()
return self.obj.head(0)

if cudf.utils.dtypes.is_scalar(chunk_results[0]):
result = cudf.Series(chunk_results, index=group_names)
Expand Down Expand Up @@ -630,7 +631,7 @@ def rolling_avg(val, avg):
.. code-block:: python

Results:
cat val avg
cat val avg
0 1 16
1 1 45
2 1 62 41.0
Expand Down Expand Up @@ -713,8 +714,8 @@ def describe(self, include=None, exclude=None):
2 24.0 90
3 26.0 80
>>> gdf.groupby('Score').describe()
Speed
count mean std min 25% 50% 75% max
Speed
count mean std min 25% 50% 75% max
Score
30 1 370.0 <NA> 370.0 370.0 370.0 370.0 370.0
50 1 380.0 <NA> 380.0 380.0 380.0 380.0 380.0
Expand Down Expand Up @@ -946,13 +947,13 @@ def fillna(
>>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]})
>>> gdf = cudf.from_pandas(df)
>>> df.groupby('k').fillna({'v': 4}) # pandas
v
v
k
1 0 2.0
1 4.0
1 4.0
2 2 4.0
>>> gdf.groupby('k').fillna({'v': 4}) # cudf
v
v
0 2.0
1 4.0
2 4.0
Expand Down Expand Up @@ -1127,9 +1128,9 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
Max Speed
Animal Type
Falcon Captive 390.0
Wild 350.0
Wild 350.0
Parrot Captive 30.0
Wild 20.0
Wild 20.0
>>> df.groupby(level=0).mean()
Max Speed
Animal
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

from __future__ import annotations

import itertools
Expand Down Expand Up @@ -1422,7 +1423,7 @@ def to_pandas(self, nullable=False, **kwargs):
if hasattr(self, "_source_data"):
result = self._source_data.to_pandas(nullable=nullable)
result.columns = self.names
return pd.MultiIndex.from_frame(result)
return pd.MultiIndex.from_frame(result, names=self.names)

pandas_codes = []
for code in self.codes.columns:
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION
# Copyright (c) 2020-2021, NVIDIA CORPORATION

import itertools

Expand Down Expand Up @@ -393,7 +393,9 @@ def __init__(self, groupby, window, min_periods=None, center=False):
# of `groupby.grouping.keys` and `groupby.obj`.
# As an optimization, avoid gathering those twice.
self._group_keys = groupby.grouping.keys.take(sort_order)
obj = groupby.obj.take(sort_order)
obj = groupby.obj.drop(
columns=groupby.grouping._key_column_names_from_obj
).take(sort_order)

gb_size = groupby.size().sort_index()
self._group_starts = (
Expand Down
39 changes: 36 additions & 3 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,18 @@ def test_categorical_as_unordered(pd_str_cat, inplace):

@pytest.mark.parametrize("from_ordered", [True, False])
@pytest.mark.parametrize("to_ordered", [True, False])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
"inplace",
[
pytest.param(
True,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43232"
),
),
False,
],
)
def test_categorical_reorder_categories(
pd_str_cat, from_ordered, to_ordered, inplace
):
Expand All @@ -420,7 +431,18 @@ def test_categorical_reorder_categories(
assert str(cd_sr_1) == str(pd_sr_1)


@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
"inplace",
[
pytest.param(
True,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43232"
),
),
False,
],
)
def test_categorical_add_categories(pd_str_cat, inplace):

pd_sr = pd.Series(pd_str_cat.copy())
Expand All @@ -441,7 +463,18 @@ def test_categorical_add_categories(pd_str_cat, inplace):
assert_eq(pd_sr_1, cd_sr_1)


@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
"inplace",
[
pytest.param(
True,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43232"
),
),
False,
],
)
def test_categorical_remove_categories(pd_str_cat, inplace):

pd_sr = pd.Series(pd_str_cat.copy())
Expand Down
40 changes: 36 additions & 4 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,9 +525,22 @@ def test_concat_empty_dataframes(df, other, ignore_index):
if expected.shape != df.shape:
for key, col in actual[actual.columns].iteritems():
if is_categorical_dtype(col.dtype):
expected[key] = expected[key].fillna("-1")
if expected[key].dtype != "category":
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = expected[key].fillna("-1").astype("str")
else:
expected[key] = (
expected[key]
.cat.add_categories(["-1"])
.fillna("-1")
.astype("str")
)
actual[key] = col.astype("str").fillna("-1")
assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False)
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)
assert_eq(expected, actual, check_dtype=False)
else:
assert_eq(
expected, actual, check_index_type=False if gdf.empty else True
Expand Down Expand Up @@ -1079,8 +1092,23 @@ def test_concat_join_empty_dataframes(
if axis == 0:
for key, col in actual[actual.columns].iteritems():
if is_categorical_dtype(col.dtype):
expected[key] = expected[key].fillna("-1")
if expected[key].dtype != "category":
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = (
expected[key].fillna("-1").astype("str")
)
else:
expected[key] = (
expected[key]
.cat.add_categories(["-1"])
.fillna("-1")
.astype("str")
)
actual[key] = col.astype("str").fillna("-1")
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)

assert_eq(
expected.fillna(-1),
Expand All @@ -1100,7 +1128,11 @@ def test_concat_join_empty_dataframes(
check_column_type=False,
)
assert_eq(
expected, actual, check_index_type=False, check_column_type=False
expected,
actual,
check_dtype=False,
check_index_type=False,
check_column_type=False,
)


Expand Down
13 changes: 1 addition & 12 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1762,18 +1762,7 @@ def test_csv_write_empty_column_name(df, index, columns):
cudf.DataFrame(index=cudf.Index([], name="index name")),
],
)
@pytest.mark.parametrize(
"index",
[
True,
pytest.param(
False,
marks=pytest.mark.xfail(
reason="https://github.com/rapidsai/cudf/issues/6691"
),
),
],
)
@pytest.mark.parametrize("index", [True, False])
def test_csv_write_empty_dataframe(df, index):
pdf = df.to_pandas()

Expand Down
Loading