Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Upgrade pandas to 1.5 #11617

Merged
merged 43 commits into from
Sep 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
a39281b
test pandas 1.5 rc0
galipremsagar Aug 29, 2022
f87f232
temp commit
galipremsagar Aug 29, 2022
fc1647a
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 29, 2022
b7b3d76
initial pass of fixes
galipremsagar Aug 30, 2022
41d0381
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 30, 2022
1b92423
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 31, 2022
7095878
fix
galipremsagar Aug 31, 2022
01bd01e
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 31, 2022
28d12db
more fixes
galipremsagar Aug 31, 2022
47eea3c
more fixes
galipremsagar Aug 31, 2022
8d53832
more fixes
galipremsagar Sep 1, 2022
a558088
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 1, 2022
5fae833
more fixes
galipremsagar Sep 1, 2022
988443d
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 1, 2022
d8d545e
fix
galipremsagar Sep 1, 2022
817bcf1
merge
galipremsagar Sep 2, 2022
ff61af7
update
galipremsagar Sep 2, 2022
092d20c
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 2, 2022
287756a
fix
galipremsagar Sep 2, 2022
f95fd4b
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 6, 2022
e409cc6
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 6, 2022
d49ee36
merge
galipremsagar Sep 8, 2022
8b6d105
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 9, 2022
08869aa
fix change in where API behavior
galipremsagar Sep 9, 2022
9424341
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 9, 2022
e7b9647
version
galipremsagar Sep 10, 2022
825d6e0
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 12, 2022
a421ecc
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 12, 2022
13a1010
raise error for setitem with bools
galipremsagar Sep 12, 2022
5547a48
cleanup
galipremsagar Sep 12, 2022
d97b09c
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 12, 2022
158d441
Update python/cudf/cudf/core/column/numerical.py
galipremsagar Sep 14, 2022
d433fa3
Apply suggestions from code review
galipremsagar Sep 20, 2022
ff6b7b2
Merge branch 'rapidsai:branch-22.10' into pandas_1.5.x
galipremsagar Sep 20, 2022
0cda494
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 20, 2022
1abc7d9
Merge branch 'pandas_1.5.x' of https://github.com/galipremsagar/cudf …
galipremsagar Sep 20, 2022
7e53b8b
address reviews
galipremsagar Sep 20, 2022
eb12f7a
fix startswith and endswith
galipremsagar Sep 20, 2022
4c19a38
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 20, 2022
80bbc02
temp commit
galipremsagar Sep 20, 2022
6c9a580
Update ci/gpu/build.sh
galipremsagar Sep 20, 2022
2f00321
improve conda install command
galipremsagar Sep 20, 2022
645feb5
Update ci/gpu/build.sh
galipremsagar Sep 20, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ gpuci_logger "Check conda environment"
conda info
conda config --show-sources
conda list --show-channel-urls

gpuci_logger "Check compiler versions"
python --version

Expand Down Expand Up @@ -251,6 +250,8 @@ fi

cd "$WORKSPACE/python/cudf/cudf"
# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
gpuci_logger "Check conda packages"
conda list
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- python>=3.8,<3.10
- numba>=0.54
- numpy
- pandas>=1.0,<1.5.0dev0
- pandas>=1.0,<1.6.0dev0
- pyarrow=9
- fastavro>=0.22.9
- python-snappy>=0.6.0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ requirements:
- protobuf>=3.20.1,<3.21.0a0
- python
- typing_extensions
- pandas >=1.0,<1.5.0dev0
- pandas >=1.0,<1.6.0dev0
- cupy >=9.5.0,<12.0.0a0
- numba >=0.54
- numpy
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from cudf.core.missing import NA
from cudf.utils.dtypes import (
_can_cast,
_dtype_can_hold_element,
find_common_type,
is_mixed_with_object_dtype,
)
Expand Down Expand Up @@ -84,6 +85,12 @@ def _check_and_cast_columns_with_other(
other, source_dtype
):
common_dtype = source_dtype
elif (
isinstance(source_col, cudf.core.column.NumericalColumn)
and other_is_scalar
and _dtype_can_hold_element(source_dtype, other)
):
common_dtype = source_dtype
else:
common_dtype = find_common_type(
[
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ def __setitem__(self, key, value):
)

if to_add_categories > 0:
raise ValueError(
raise TypeError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
is_string_dtype,
is_struct_dtype,
)
from cudf.core._compat import PANDAS_GE_150
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like
from cudf.core.dtypes import (
Expand All @@ -83,6 +84,11 @@
)
from cudf.utils.utils import _array_ufunc, mask_dtype

if PANDAS_GE_150:
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
else:
from pandas.core.arrays._arrow_utils import ArrowIntervalType

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Expand Down Expand Up @@ -290,9 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
size=codes.size,
ordered=array.type.ordered,
)
elif isinstance(
array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
):
elif isinstance(array.type, ArrowIntervalType):
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
return cudf.core.column.IntervalColumn.from_arrow(array)

result = libcudf.interop.from_arrow(data)[0]
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,5 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
return pd.Series(
pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index
self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
)
38 changes: 38 additions & 0 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
is_integer,
is_integer_dtype,
is_number,
is_scalar,
)
from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like
from cudf.core.column import (
Expand Down Expand Up @@ -128,6 +129,43 @@ def has_nulls(self, include_nan=False):
self.nan_count != 0 if include_nan else False
)

def __setitem__(self, key: Any, value: Any):
    """
    Set the value of ``self[key]`` to ``value``.

    If ``value`` and ``self`` are of different types, ``value`` is coerced
    to ``self.dtype``.

    Raises
    ------
    TypeError
        If ``value`` has a boolean dtype while this column does not.
    ValueError
        If ``key`` is not a slice and does not convert to a
        ``NumericalColumn``.
    """

    # Normalize value to scalar/column. A null host scalar carries no
    # dtype of its own, so it is given this column's dtype up front.
    if is_scalar(value):
        scalar_dtype = None
        if cudf._lib.scalar._is_null_host_scalar(value):
            scalar_dtype = self.dtype
        device_value = cudf.Scalar(value, dtype=scalar_dtype)
    else:
        device_value = as_column(value)

    # Booleans are rejected for non-boolean columns rather than being
    # silently cast.
    if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype):
        raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
    device_value = device_value.astype(self.dtype)

    out: Optional[ColumnBase]  # If None, no need to perform mimic inplace.
    if isinstance(key, slice):
        out = self._scatter_by_slice(key, device_value)
    else:
        scatter_map = as_column(key)
        if not isinstance(scatter_map, cudf.core.column.NumericalColumn):
            raise ValueError(
                f"Invalid scatter map type {scatter_map.dtype}."
            )
        out = self._scatter_by_column(scatter_map, device_value)

    if out:
        self._mimic_inplace(out, inplace=True)

@property
def __cuda_array_interface__(self) -> Mapping[str, Any]:
output = {
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3752,8 +3752,9 @@ def endswith(self, pat: str) -> SeriesOrIndex:
dtype: bool
"""
if pat is None:
result_col = column.column_empty(
len(self._column), dtype="bool", masked=True
raise TypeError(
f"expected a string or a sequence-like object, not "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: in 1.5 we also further restricted startswith/endswith to accept only a tuple, specifically to mirror the stdlib:

In [15]: "f".endswith(["f", "g"])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [15], in <cell line: 1>()
----> 1 "f".endswith(["f", "g"])

TypeError: endswith first arg must be str or a tuple of str, not list

Copy link
Contributor Author

@galipremsagar galipremsagar Sep 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did see that change; I'll be opening a follow-up PR because it will need some discussion and libcudf-side changes too.

f"{type(pat).__name__}"
)
elif is_scalar(pat):
result_col = libstrings.endswith(
Expand Down Expand Up @@ -3814,8 +3815,9 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex:
dtype: bool
"""
if pat is None:
result_col = column.column_empty(
len(self._column), dtype="bool", masked=True
raise TypeError(
f"expected a string or a sequence-like object, not "
f"{type(pat).__name__}"
)
elif is_scalar(pat):
result_col = libstrings.startswith(
Expand Down
14 changes: 12 additions & 2 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,22 @@
import pyarrow as pa
from pandas.api import types as pd_types
from pandas.api.extensions import ExtensionDtype
from pandas.core.arrays._arrow_utils import ArrowIntervalType
from pandas.core.dtypes.dtypes import (
CategoricalDtype as pd_CategoricalDtype,
CategoricalDtypeType as pd_CategoricalDtypeType,
)

import cudf
from cudf._typing import Dtype
from cudf.core._compat import PANDAS_GE_130
from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150
from cudf.core.abc import Serializable
from cudf.core.buffer import DeviceBufferLike

if PANDAS_GE_150:
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
else:
from pandas.core.arrays._arrow_utils import ArrowIntervalType


def dtype(arbitrary):
"""
Expand Down Expand Up @@ -610,6 +614,12 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
else:
return cls(subtype=pd_dtype.subtype)

def to_pandas(self) -> pd.IntervalDtype:
    """
    Build the ``pd.IntervalDtype`` that corresponds to this dtype.

    ``subtype`` is always forwarded. ``closed`` is forwarded only when
    the ``PANDAS_GE_130`` compatibility flag is set (presumably
    pandas >= 1.3.0 -- confirm against ``cudf.core._compat``).
    """
    interval_kwargs = {"subtype": self.subtype}
    if PANDAS_GE_130:
        interval_kwargs["closed"] = self.closed
    return pd.IntervalDtype(**interval_kwargs)

def __eq__(self, other):
if isinstance(other, str):
# This means equality isn't transitive but mimics pandas
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from cudf._typing import DataFrameOrSeries
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._compat import PANDAS_GE_120
from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150
from cudf.core.frame import Frame
from cudf.core.index import (
BaseIndex,
Expand Down Expand Up @@ -451,8 +451,8 @@ def __repr__(self):
)
)

if PANDAS_GE_120:
# TODO: Remove this whole `if` block,
if PANDAS_GE_120 and not PANDAS_GE_150:
# Need this whole `if` block,
# this is a workaround for the following issue:
# https://github.com/pandas-dev/pandas/issues/39984
preprocess_pdf = pd.DataFrame(
Expand Down
22 changes: 16 additions & 6 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from cudf import _lib as libcudf
from cudf.api.types import is_integer, is_number
from cudf.core import column
from cudf.core._compat import PANDAS_GE_150
from cudf.core.column.column import as_column
from cudf.core.mixins import Reducible
from cudf.utils import cudautils
Expand Down Expand Up @@ -215,12 +216,21 @@ def _apply_agg_column(self, source_column, agg_name):
following_window = None
window = self.window
elif isinstance(self.window, BaseIndexer):
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
)
if PANDAS_GE_150:
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
step=None,
)
else:
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
)
start = as_column(start, dtype="int32")
end = as_column(end, dtype="int32")

Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/tests/test_array_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.testing._utils import assert_eq, set_random_null_mask_inplace

_UFUNCS = [
Expand Down Expand Up @@ -84,14 +85,19 @@ def test_ufunc_index(ufunc):
assert_eq(g, e, check_exact=False)
else:
assert_eq(got, expect, check_exact=False)
except AssertionError:
except AssertionError as e:
# TODO: This branch can be removed when
# https://github.com/rapidsai/cudf/issues/10178 is resolved
if fname in ("power", "float_power"):
if (got - expect).abs().max() == 1:
pytest.xfail("https://github.com/rapidsai/cudf/issues/10178")
elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"):
pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769")
if PANDAS_GE_150:
raise e
else:
pytest.xfail(
"https://github.com/pandas-dev/pandas/issues/46769"
)
raise


Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import cudf
from cudf import Series
from cudf.core._compat import PANDAS_GE_150
from cudf.core.index import as_index
from cudf.testing import _utils as utils
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -768,7 +769,7 @@ def test_operator_func_between_series_logical(
@pytest.mark.parametrize("func", _operators_comparison)
@pytest.mark.parametrize("has_nulls", [True, False])
@pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0])
@pytest.mark.parametrize("fill_value", [None, True, False, 1.0])
@pytest.mark.parametrize("fill_value", [None, 1.0])
@pytest.mark.parametrize("use_cudf_scalar", [False, True])
def test_operator_func_series_and_scalar_logical(
dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar
Expand Down Expand Up @@ -1561,7 +1562,8 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
pytest.param(
"nanoseconds",
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/36589"
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/issues/36589",
),
),
],
Expand Down Expand Up @@ -1668,7 +1670,8 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
pytest.param(
"nanoseconds",
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/36589"
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/issues/36589",
),
),
],
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace):
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down Expand Up @@ -454,7 +454,7 @@ def test_categorical_reorder_categories(
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down Expand Up @@ -491,7 +491,7 @@ def test_categorical_add_categories(pd_str_cat, inplace):
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down
Loading