Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Upgrade cudf to support pandas 1.4.x versions #10584

Merged
merged 50 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
3e95441
upgrade pandas versions
galipremsagar Apr 4, 2022
978a2c1
handle breaking pd.core.window changes
galipremsagar Apr 4, 2022
bea489f
first pass of changes
galipremsagar Apr 4, 2022
d8b06d0
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 5, 2022
925524c
second pass of fixes
galipremsagar Apr 5, 2022
38934ef
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 5, 2022
fe297fe
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 6, 2022
e6098bf
more fixes
galipremsagar Apr 6, 2022
bb3a6b9
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 7, 2022
dd2b05e
another pass
galipremsagar Apr 8, 2022
531527c
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 8, 2022
203b103
more fixes
galipremsagar Apr 11, 2022
5c2010c
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 12, 2022
53363db
remove xfails
galipremsagar Apr 12, 2022
f5dae8c
workaround
galipremsagar Apr 12, 2022
3a0e89b
fix a pytest
galipremsagar Apr 12, 2022
856f5f5
xfail
galipremsagar Apr 12, 2022
1eabd0e
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar Apr 13, 2022
8a93674
fixes
galipremsagar Apr 13, 2022
eb34d83
merge
galipremsagar Apr 26, 2022
eeaff59
xfail
galipremsagar Apr 26, 2022
dff79f6
xfail
galipremsagar Apr 26, 2022
c605b33
cleanup
galipremsagar Apr 26, 2022
69c5fc6
temp commit
galipremsagar Apr 26, 2022
b6f0067
Merge branch 'rapidsai:branch-22.06' into pandas_1.4x
galipremsagar Apr 27, 2022
7615e71
Update build.sh
galipremsagar Apr 27, 2022
32665f8
Update build.sh
galipremsagar Apr 27, 2022
c0a2e10
Update build.sh
galipremsagar Apr 28, 2022
e1e1f36
Update build.sh
galipremsagar Apr 28, 2022
2ff1692
Update build.sh
galipremsagar Apr 28, 2022
29b118c
Update build.sh
galipremsagar May 3, 2022
f5588df
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar May 3, 2022
c7ab944
Update build.sh
galipremsagar May 3, 2022
0bccdcf
Update build.sh
galipremsagar May 3, 2022
72faede
Update build.sh
galipremsagar May 3, 2022
3b92aab
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar May 3, 2022
762cc0f
fix
galipremsagar May 3, 2022
7655c6e
Merge branch 'pandas_1.4x' of https://github.com/galipremsagar/cudf i…
galipremsagar May 3, 2022
5d2184f
Merge remote-tracking branch 'upstream/branch-22.06' into pandas_1.4x
galipremsagar May 5, 2022
b99bae3
import warnings
galipremsagar May 5, 2022
8bbbff5
change to single warning
galipremsagar May 5, 2022
75a1e42
add negative rangeIndex test
galipremsagar May 5, 2022
106edc3
fix name assignment
galipremsagar May 5, 2022
72666b5
version workarounds
galipremsagar May 5, 2022
e551332
Apply suggestions from code review
galipremsagar May 5, 2022
09ab290
Merge branch 'pandas_1.4x' of https://github.com/galipremsagar/cudf i…
galipremsagar May 5, 2022
76b9973
add index type check
galipremsagar May 5, 2022
a005ee9
Update build.sh
galipremsagar May 7, 2022
d89bd9b
Update ci/gpu/build.sh
galipremsagar May 7, 2022
05f0a54
fix recipe
galipremsagar May 9, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- python>=3.7,<3.9
- numba>=0.54
- numpy
- pandas>=1.0,<1.4.0dev0
- pandas>=1.0,<1.5.0dev0
- pyarrow=7.0.0=*cuda
- fastavro>=0.22.9
- python-snappy>=0.6.0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ requirements:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.4.0dev0
- pandas >=1.0,<1.5.0dev0
- cupy >=9.5.0,<11.0.0a0
- numba >=0.54
- numpy
Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import pickle
import warnings

import pandas as pd

Expand Down Expand Up @@ -608,10 +609,20 @@ def shift(Column input, int offset, object fill_value=None):
cdef DeviceScalar fill

if isinstance(fill_value, DeviceScalar):
fill_value_type = fill_value.dtype
fill = fill_value
else:
fill_value_type = type(fill_value)
fill = as_device_scalar(fill_value, input.dtype)

if not cudf.utils.dtypes._can_cast(input.dtype, fill_value_type):
warnings.warn(
f"Passing {fill_value_type} to shift is deprecated and will "
f"raise in a future version"
f", pass a {input.dtype} scalar instead.",
FutureWarning,
)

cdef column_view c_input = input.view()
cdef int32_t c_offset = offset
cdef const scalar* c_fill_value = fill.get_raw_ptr()
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@
PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2")
PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0")
PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,7 +1227,7 @@ def fillna(
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value, CategoricalColumn):
if self.dtype != fill_value.dtype:
raise ValueError(
raise TypeError(
"Cannot set a Categorical with another, "
"without identical categories"
)
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,9 +790,8 @@ def _init_from_series_list(self, data, columns, index):
data.extend([o for o in initial_data])
else:
raise ValueError(
f"Shape of passed values is "
f"{(data_length, len(data[0]))}, "
f"indices imply {(index_length, len(data[0]))}"
f"Length of values ({data_length}) does "
f"not match length of index ({index_length})"
)

final_index = as_index(index)
Expand Down
24 changes: 12 additions & 12 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2586,7 +2586,7 @@ def _reduce(self, *args, **kwargs):
def min(
self,
axis=None,
skipna=None,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
Expand Down Expand Up @@ -2637,7 +2637,7 @@ def min(
def max(
self,
axis=None,
skipna=None,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
Expand Down Expand Up @@ -2688,7 +2688,7 @@ def max(
def sum(
self,
axis=None,
skipna=None,
skipna=True,
dtype=None,
level=None,
numeric_only=None,
Expand Down Expand Up @@ -2747,7 +2747,7 @@ def sum(
def product(
self,
axis=None,
skipna=None,
skipna=True,
dtype=None,
level=None,
numeric_only=None,
Expand Down Expand Up @@ -2810,7 +2810,7 @@ def product(

@_cudf_nvtx_annotate
def mean(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
"""
Return the mean of the values for the requested axis.
Expand Down Expand Up @@ -2857,7 +2857,7 @@ def mean(
def std(
self,
axis=None,
skipna=None,
skipna=True,
level=None,
ddof=1,
numeric_only=None,
Expand Down Expand Up @@ -2914,7 +2914,7 @@ def std(
def var(
self,
axis=None,
skipna=None,
skipna=True,
level=None,
ddof=1,
numeric_only=None,
Expand Down Expand Up @@ -2968,12 +2968,12 @@ def var(

@_cudf_nvtx_annotate
def kurtosis(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
"""
Return Fisher's unbiased kurtosis of a sample.

Kurtosis obtained using Fishers definition of
Kurtosis obtained using Fisher's definition of
kurtosis (kurtosis of normal == 0.0). Normalized by N-1.

Parameters
Expand Down Expand Up @@ -3025,7 +3025,7 @@ def kurtosis(
# Alias for kurtosis.
@copy_docstring(kurtosis)
def kurt(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
return self.kurtosis(
axis=axis,
Expand All @@ -3037,7 +3037,7 @@ def kurt(

@_cudf_nvtx_annotate
def skew(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
"""
Return unbiased Fisher-Pearson skew of a sample.
Expand Down Expand Up @@ -3199,7 +3199,7 @@ def sum_of_squares(self, dtype=None):

@_cudf_nvtx_annotate
def median(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
"""
Return the median of the values for the requested axis.
Expand Down
30 changes: 30 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,36 @@ def _intersection(self, other, sort=False):

return new_index

def sort_values(
    self,
    return_indexer=False,
    ascending=True,
    na_position="last",
    key=None,
):
    """
    Return this index in sorted order, optionally with the indexer.

    No actual sort is performed: the index is either returned as-is
    or reversed, depending on the sign of ``self.step`` and on
    ``ascending``.
    NOTE(review): this relies on ``self.step``, so it presumably
    lives on ``RangeIndex`` -- confirm against the enclosing class.

    Parameters
    ----------
    return_indexer : bool, default False
        If True, also return the positions that put the index into
        the returned order.
    ascending : bool, default True
        Requested sort direction.
    na_position : {"first", "last"}, default "last"
        Only validated here; it does not otherwise affect the result.
    key : callable, optional
        Not supported; any value other than None raises.

    Returns
    -------
    sorted_index
        ``self`` when it is already in the requested order, otherwise
        ``self[::-1]``.
    (sorted_index, indexer)
        Returned instead when ``return_indexer`` is True; ``indexer``
        is a ``RangeIndex`` over ``range(len(self))``, reversed
        whenever the index itself was reversed.

    Raises
    ------
    NotImplementedError
        If ``key`` is not None.
    ValueError
        If ``na_position`` is neither "first" nor "last".
    """
    if key is not None:
        raise NotImplementedError("key parameter is not yet implemented.")
    if na_position not in {"first", "last"}:
        raise ValueError(f"invalid na_position: {na_position}")

    sorted_index = self
    indexer = RangeIndex(range(len(self)))

    # A reversal is needed only when the direction implied by the sign
    # of ``step`` disagrees with the requested order.
    if ascending:
        needs_reverse = self.step < 0
    else:
        needs_reverse = self.step > 0

    if needs_reverse:
        sorted_index = self[::-1]
        indexer = indexer[::-1]

    if return_indexer:
        return sorted_index, indexer
    return sorted_index

@_cudf_nvtx_annotate
def _gather(self, gather_map, nullify=False, check_bounds=True):
gather_map = cudf.core.column.as_column(gather_map)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _align_objs(objs, how="outer", sort=None):

if not_matching_index:
if not all(o.index.is_unique for o in objs):
raise ValueError("cannot reindex from a duplicate axis")
raise ValueError("cannot reindex on an axis with duplicate labels")

index = objs[0].index
name = index.name
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,10 +406,12 @@ def __init__(
else:
index = as_index(data.index)
elif isinstance(data, pd.Index):
name = data.name
if name is None:
name = data.name
data = data.values
elif isinstance(data, BaseIndex):
name = data.name
if name is None:
name = data.name
data = data._values
if dtype is not None:
data = data.astype(dtype)
Expand Down Expand Up @@ -805,8 +807,9 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False):
return cudf.core.dataframe.DataFrame._from_data(data, index)
# For ``name`` behavior, see:
# https://github.com/pandas-dev/pandas/issues/44575
# ``name`` has to be ignored when `drop=True`
return self._mimic_inplace(
Series._from_data(data, index, name if inplace else None),
Series._from_data(data, index, self.name),
inplace=inplace,
)

Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_array_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def test_ufunc_index(ufunc):
if fname in ("power", "float_power"):
if (got - expect).abs().max() == 1:
pytest.xfail("https://github.com/rapidsai/cudf/issues/10178")
elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"):
pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769")
raise


Expand Down
68 changes: 20 additions & 48 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import cudf as gd
from cudf.api.types import is_categorical_dtype
from cudf.core._compat import PANDAS_LT_140
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
from cudf.testing._utils import assert_eq, assert_exceptions_equal

Expand Down Expand Up @@ -341,8 +342,8 @@ def test_pandas_concat_compatibility_axis1():
got = gd.concat([d1, d2, d3, d4, d5], axis=1)

assert_eq(
got,
expect,
got.sort_index(),
expect.sort_index(),
check_index_type=True,
)

Expand Down Expand Up @@ -659,9 +660,12 @@ def test_concat_dataframe_with_multiindex(df1, df2):
actual = gd.concat([gdf1, gdf2], axis=1)
expected = pd.concat([pdf1, pdf2], axis=1)

# Sort the index on both sides before comparing, because the
# ordering of a pandas MultiIndex produced by concat is not
# deterministic.
assert_eq(
expected,
actual,
expected.sort_index(),
actual.sort_index(),
check_index_type=True,
)

Expand Down Expand Up @@ -798,18 +802,8 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis):
ignore_index=ignore_index,
axis=axis,
)
# TODO: Remove special handling below
# after following bug from pandas is fixed:
# https://github.com/pandas-dev/pandas/issues/43584
assert_eq(
expected,
actual,
check_index_type=False
if sort
and isinstance(expected.index, pd.Int64Index)
and isinstance(actual.index, gd.RangeIndex)
else True,
)

assert_eq(expected, actual, check_index_type=True)


@pytest.mark.parametrize("ignore_index", [True, False])
Expand Down Expand Up @@ -875,18 +869,8 @@ def test_concat_join_one_df(ignore_index, sort, join, axis):
actual = gd.concat(
[gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis
)
# TODO: Remove special handling below
# after following bug from pandas is fixed:
# https://github.com/pandas-dev/pandas/issues/43584
assert_eq(
expected,
actual,
check_index_type=False
if sort
and isinstance(expected.index, pd.Int64Index)
and isinstance(actual.index, gd.RangeIndex)
else True,
)

assert_eq(expected, actual, check_index_type=True)


@pytest.mark.parametrize(
Expand All @@ -910,6 +894,10 @@ def test_concat_join_one_df(ignore_index, sort, join, axis):
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.xfail(
condition=PANDAS_LT_140,
reason="https://github.com/pandas-dev/pandas/issues/43584",
)
def test_concat_join_no_overlapping_columns(
pdf1, pdf2, ignore_index, sort, join, axis
):
Expand All @@ -931,19 +919,7 @@ def test_concat_join_no_overlapping_columns(
axis=axis,
)

# TODO: Remove special handling below
# after following bug from pandas is fixed:
# https://github.com/pandas-dev/pandas/issues/43584
assert_eq(
expected,
actual,
check_index_type=False
if sort
and axis == 1
and isinstance(expected.index, pd.Int64Index)
and isinstance(actual.index, gd.RangeIndex)
else True,
)
assert_eq(expected, actual, check_index_type=True)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize("ignore_index", [False, True])
Expand Down Expand Up @@ -1097,7 +1073,7 @@ def test_concat_join_no_overlapping_columns_empty_df_basic(
)
# TODO: change `check_index_type` to `True`
# after following bug from pandas is fixed:
# https://github.com/pandas-dev/pandas/issues/43584
# https://github.com/pandas-dev/pandas/issues/46675
assert_eq(expected, actual, check_index_type=False)


Expand Down Expand Up @@ -1133,15 +1109,11 @@ def test_concat_join_series(ignore_index, sort, join, axis):

# TODO: Remove special handling below
# after following bug from pandas is fixed:
# https://github.com/pandas-dev/pandas/issues/43584
# https://github.com/pandas-dev/pandas/issues/46675
assert_eq(
expected,
actual,
check_index_type=False
if sort
and isinstance(expected.index, pd.Int64Index)
and isinstance(actual.index, gd.RangeIndex)
else True,
check_index_type=False if axis == 1 and join == "outer" else True,
)


Expand Down
Loading