diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 15f4bff583e..1b79bdb763f 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.7,<3.9 - numba>=0.54 - numpy - - pandas>=1.0,<1.4.0dev0 + - pandas>=1.0,<1.5.0dev0 - pyarrow=7.0.0=*cuda - fastavro>=0.22.9 - python-snappy>=0.6.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 84443a45567..a88eea949e9 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -42,7 +42,7 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0,<1.4.0dev0 + - pandas >=1.0,<1.5.0dev0 - cupy >=9.5.0,<11.0.0a0 - numba >=0.54 - numpy diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index abf20869a15..1e315ea4785 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. import pickle +import warnings import pandas as pd @@ -608,10 +609,20 @@ def shift(Column input, int offset, object fill_value=None): cdef DeviceScalar fill if isinstance(fill_value, DeviceScalar): + fill_value_type = fill_value.dtype fill = fill_value else: + fill_value_type = type(fill_value) fill = as_device_scalar(fill_value, input.dtype) + if not cudf.utils.dtypes._can_cast(input.dtype, fill_value_type): + warnings.warn( + f"Passing {fill_value_type} to shift is deprecated and will " + f"raise in a future version" + f", pass a {input.dtype} scalar instead.", + FutureWarning, + ) + cdef column_view c_input = input.view() cdef int32_t c_offset = offset cdef const scalar* c_fill_value = fill.get_raw_ptr() diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 70162c7afc6..f30d229ee4e 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -9,5 +9,6 @@ PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0") +PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3") PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f9bb7ea2f1a..777e8ac7463 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1227,7 +1227,7 @@ def fillna( fill_value = column.as_column(fill_value, nan_as_null=False) if isinstance(fill_value, CategoricalColumn): if self.dtype != fill_value.dtype: - raise ValueError( + raise TypeError( "Cannot set a Categorical with another, " "without identical categories" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 036ef890696..a3e2f40b28e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -790,9 +790,8 @@ def _init_from_series_list(self, data, columns, index): data.extend([o for o in initial_data]) else: raise ValueError( - f"Shape of passed values is " - f"{(data_length, len(data[0]))}, " - f"indices imply {(index_length, len(data[0]))}" + f"Length of values ({data_length}) does " + f"not match length of index ({index_length})" ) final_index = as_index(index) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d0e9e6d94c1..84d3d95e216 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2586,7 +2586,7 @@ def _reduce(self, *args, **kwargs): def min( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, **kwargs, @@ -2637,7 +2637,7 @@ def min( def max( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, **kwargs, @@ -2688,7 +2688,7 @@ def max( def sum( self, axis=None, - skipna=None, + skipna=True, dtype=None, level=None, numeric_only=None, @@ -2747,7 +2747,7 @@ def sum( def product( self, axis=None, - skipna=None, + skipna=True, dtype=None, level=None, numeric_only=None, @@ -2810,7 +2810,7 @@ def product( @_cudf_nvtx_annotate def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs ): """ Return the mean of the values for the requested axis. @@ -2857,7 +2857,7 @@ def mean( def std( self, axis=None, - skipna=None, + skipna=True, level=None, ddof=1, numeric_only=None, @@ -2914,7 +2914,7 @@ def std( def var( self, axis=None, - skipna=None, + skipna=True, level=None, ddof=1, numeric_only=None, @@ -2968,12 +2968,12 @@ def var( @_cudf_nvtx_annotate def kurtosis( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs ): """ Return Fisher's unbiased kurtosis of a sample. - Kurtosis obtained using Fisher’s definition of + Kurtosis obtained using Fisher's definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1. Parameters @@ -3025,7 +3025,7 @@ def kurtosis( # Alias for kurtosis. @copy_docstring(kurtosis) def kurt( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs ): return self.kurtosis( axis=axis, @@ -3037,7 +3037,7 @@ def kurt( @_cudf_nvtx_annotate def skew( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs ): """ Return unbiased Fisher-Pearson skew of a sample. @@ -3199,7 +3199,7 @@ def sum_of_squares(self, dtype=None): @_cudf_nvtx_annotate def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs ): """ Return the median of the values for the requested axis. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1ed530ae22b..37039a009ca 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -722,6 +722,36 @@ def _intersection(self, other, sort=False): return new_index + def sort_values( + self, + return_indexer=False, + ascending=True, + na_position="last", + key=None, + ): + if key is not None: + raise NotImplementedError("key parameter is not yet implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + + sorted_index = self + indexer = RangeIndex(range(len(self))) + + sorted_index = self + if ascending: + if self.step < 0: + sorted_index = self[::-1] + indexer = indexer[::-1] + else: + if self.step > 0: + sorted_index = self[::-1] + indexer = indexer = indexer[::-1] + + if return_indexer: + return sorted_index, indexer + else: + return sorted_index + @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b405c018983..744437a02c7 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -43,7 +43,7 @@ def _align_objs(objs, how="outer", sort=None): if not_matching_index: if not all(o.index.is_unique for o in objs): - raise ValueError("cannot reindex from a duplicate axis") + raise ValueError("cannot reindex on an axis with duplicate labels") index = objs[0].index name = index.name diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d813db58d1e..41d7c11870f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -406,10 +406,12 @@ def __init__( else: index = as_index(data.index) elif isinstance(data, pd.Index): - name = data.name + if name is None: + name = data.name data = data.values elif isinstance(data, BaseIndex): - name = data.name + if name is None: + name = data.name data = data._values if dtype is not None: data = data.astype(dtype) @@ -805,8 +807,9 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): return cudf.core.dataframe.DataFrame._from_data(data, index) # For ``name`` behavior, see: # https://github.com/pandas-dev/pandas/issues/44575 + # ``name`` has to be ignored when `drop=True` return self._mimic_inplace( - Series._from_data(data, index, name if inplace else None), + Series._from_data(data, index, self.name), inplace=inplace, ) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 19ef2b66c2a..3ff5210ed94 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -90,6 +90,8 @@ def test_ufunc_index(ufunc): if fname in ("power", "float_power"): if (got - expect).abs().max() == 1: pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"): + pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769") raise diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 2017ba06f76..4eab68e83a6 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,6 +9,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype +from cudf.core._compat import PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -341,8 +342,8 @@ def test_pandas_concat_compatibility_axis1(): got = gd.concat([d1, d2, d3, d4, d5], axis=1) assert_eq( - got, - expect, + got.sort_index(), + expect.sort_index(), check_index_type=True, ) @@ -659,9 +660,12 @@ def test_concat_dataframe_with_multiindex(df1, df2): actual = gd.concat([gdf1, gdf2], axis=1) expected = pd.concat([pdf1, pdf2], axis=1) + # Will need to sort_index before comparing as + # ordering is not deterministic in case of pandas + # multiIndex with concat. assert_eq( - expected, - actual, + expected.sort_index(), + actual.sort_index(), check_index_type=True, ) @@ -798,18 +802,8 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): ignore_index=ignore_index, axis=axis, ) - # TODO: Remove special handling below - # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/43584 - assert_eq( - expected, - actual, - check_index_type=False - if sort - and isinstance(expected.index, pd.Int64Index) - and isinstance(actual.index, gd.RangeIndex) - else True, - ) + + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -875,18 +869,8 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): actual = gd.concat( [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - # TODO: Remove special handling below - # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/43584 - assert_eq( - expected, - actual, - check_index_type=False - if sort - and isinstance(expected.index, pd.Int64Index) - and isinstance(actual.index, gd.RangeIndex) - else True, - ) + + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize( @@ -910,6 +894,10 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.xfail( + condition=PANDAS_LT_140, + reason="https://github.com/pandas-dev/pandas/issues/43584", +) def test_concat_join_no_overlapping_columns( pdf1, pdf2, ignore_index, sort, join, axis ): @@ -931,19 +919,7 @@ def test_concat_join_no_overlapping_columns( axis=axis, ) - # TODO: Remove special handling below - # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/43584 - assert_eq( - expected, - actual, - check_index_type=False - if sort - and axis == 1 - and isinstance(expected.index, pd.Int64Index) - and isinstance(actual.index, gd.RangeIndex) - else True, - ) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [False, True]) @@ -1097,7 +1073,7 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ) # TODO: change `check_index_type` to `True` # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/43584 + # https://github.com/pandas-dev/pandas/issues/46675 assert_eq(expected, actual, check_index_type=False) @@ -1133,15 +1109,11 @@ def test_concat_join_series(ignore_index, sort, join, axis): # TODO: Remove special handling below # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/43584 + # https://github.com/pandas-dev/pandas/issues/46675 assert_eq( expected, actual, - check_index_type=False - if sort - and isinstance(expected.index, pd.Int64Index) - and isinstance(actual.index, gd.RangeIndex) - else True, + check_index_type=False if axis == 1 and join == "outer" else True, ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0c4bf68faa9..acad2507292 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1275,6 +1275,7 @@ def test_csv_reader_column_names(names): assert list(df) == list(names) +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/10618") def test_csv_reader_repeated_column_name(): buffer = """A,A,A.1,A,A.2,A,A.4,A,A 1,2,3.1,4,a.2,a,a.4,a,a diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7f482c0e776..f388bc4ed0a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -20,7 +20,12 @@ from numba import cuda import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 +from cudf.core._compat import ( + PANDAS_GE_110, + PANDAS_GE_120, + PANDAS_GE_134, + PANDAS_LT_140, +) from cudf.core.column import column from cudf.testing import _utils as utils from cudf.testing._utils import ( @@ -1941,7 +1946,7 @@ def gdf(pdf): "any", ], ) -@pytest.mark.parametrize("skipna", [True, False, None]) +@pytest.mark.parametrize("skipna", [True, False]) def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) @@ -2005,7 +2010,7 @@ def test_dataframe_count_reduction(data, func): ], ) @pytest.mark.parametrize("ops", ["sum", "product", "prod"]) -@pytest.mark.parametrize("skipna", [True, False, None]) +@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) @@ -3072,7 +3077,8 @@ def test_dataframe_empty_sort_index(): pd.RangeIndex(2, -1, -1), marks=[ pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43591" + condition=PANDAS_LT_140, + reason="https://github.com/pandas-dev/pandas/issues/43591", ) ], ), @@ -6937,7 +6943,16 @@ def test_dataframe_append_series_dict(df, other, sort): actual = gdf.append(other_gd, ignore_index=True, sort=sort) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + # Ignore the column type comparison because pandas incorrectly + # returns pd.Index([1, 2, 3], dtype="object") instead + # of pd.Index([1, 2, 3], dtype="int64") + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=False, + check_index_type=True, + ) else: assert_eq( expected, actual, check_index_type=False if gdf.empty else True @@ -7156,7 +7171,12 @@ def test_dataframe_append_lists(df, other, sort, ignore_index): actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=False if gdf.empty else True, + ) else: assert_eq( expected, actual, check_index_type=False if gdf.empty else True @@ -7510,6 +7530,12 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns): actual = cudf.DataFrame(gd_data, columns=columns) if ignore_dtype: + # When a union is performed to generate columns, + # the order is never guaranteed. Hence sort by + # columns before comparison. + if not expected.columns.equals(actual.columns): + expected = expected.sort_index(axis=1) + actual = actual.sort_index(axis=1) assert_eq( expected.fillna(-1), actual.fillna(-1), @@ -7599,6 +7625,12 @@ def test_dataframe_init_from_series_list_with_index( actual = cudf.DataFrame(gd_data, columns=columns, index=index) if ignore_dtype: + # When a union is performed to generate columns, + # the order is never guaranteed. Hence sort by + # columns before comparison. + if not expected.columns.equals(actual.columns): + expected = expected.sort_index(axis=1) + actual = actual.sort_index(axis=1) assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) else: assert_eq(expected, actual) @@ -8630,14 +8662,16 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): pdf = pd.DataFrame(data, index=p_index, columns=labels) gdf = cudf.from_pandas(pdf) - # TODO: Remove this workaround after - # following issue is fixed: - # https://github.com/pandas-dev/pandas/issues/43314 - if isinstance(label_to_explode, int): - pdlabel_to_explode = [label_to_explode] + if PANDAS_GE_134: + expect = pdf.explode(label_to_explode, ignore_index) else: - pdlabel_to_explode = label_to_explode - expect = pdf.explode(pdlabel_to_explode, ignore_index) + # https://github.com/pandas-dev/pandas/issues/43314 + if isinstance(label_to_explode, int): + pdlabel_to_explode = [label_to_explode] + else: + pdlabel_to_explode = label_to_explode + expect = pdf.explode(pdlabel_to_explode, ignore_index) + got = gdf.explode(label_to_explode, ignore_index) assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 8be338e787a..07242ea49f5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -13,6 +13,7 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series +from cudf.core._compat import PANDAS_LT_140 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1463,7 +1464,7 @@ def test_is_month_start(data, dtype): pytest.param( {"hours": 10, "days": 57, "nanoseconds": 3}, marks=pytest.mark.xfail( - True, + condition=PANDAS_LT_140, reason="Pandas ignoring nanoseconds component. " "https://github.com/pandas-dev/pandas/issues/44393", ), @@ -1550,6 +1551,8 @@ def test_date_range_end_freq_periods(end, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq else: + if "nanoseconds" in freq: + pytest.xfail("https://github.com/pandas-dev/pandas/issues/46877") _gfreq = cudf.DateOffset(**freq) _pfreq = pd.DateOffset(**freq) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 9e87fdbd3be..b1625b5f67e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1350,7 +1350,7 @@ def test_groupby_nth(n, by): @pytest.mark.xfail( - condition=PANDAS_GE_130 and PANDAS_LT_140, + condition=PANDAS_GE_130, reason="https://github.com/pandas-dev/pandas/issues/43209", ) def test_raise_data_error(): @@ -1890,6 +1890,7 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) @pytest.mark.parametrize("fill_value", [None, 0, 42]) +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/10608") def test_groupby_shift_row_mixed_numerics( nelem, shift_perc, direction, fill_value ): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05830f79880..d81a9f30cfa 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_133 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, @@ -504,7 +504,8 @@ def test_empty_df_head_tail_index(n): 10, None, marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43240" + condition=not PANDAS_GE_133, + reason="https://github.com/pandas-dev/pandas/issues/43240", ), ), ( @@ -705,6 +706,7 @@ def test_index_argsort(data): pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), pd.RangeIndex(0, 10, 1), + pd.RangeIndex(0, -100, -2), pd.Index([-10.2, 100.1, -100.2, 0.0, 23], dtype="timedelta64[ns]"), ], ) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 790fbd0d3f8..6d4cd21fad6 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -981,7 +981,13 @@ def test_series_setitem_iloc(key, value, nulls): @pytest.mark.parametrize( "key, value", [ - (0, 0.5), + pytest.param( + 0, + 0.5, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/9913" + ), + ), ([0, 1], 0.5), ([0, 1], [0.5, 2.5]), (slice(0, 2), [0.5, 0.25]), @@ -1446,7 +1452,12 @@ def test_loc_zero_dim_array(): slice((1, 2), None), slice(None, (1, 2)), (1, 1), - (1, slice(None)), + pytest.param( + (1, slice(None)), + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/46704" + ), + ), ], ) def test_loc_series_multiindex(arg): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f3830ed386a..4d06e869fdf 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -15,7 +15,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_130, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_130 from cudf.core.column import as_column from cudf.core.index import as_index from cudf.testing._utils import assert_eq, assert_exceptions_equal, assert_neq @@ -1031,7 +1031,7 @@ def test_multicolumn_loc(pdf, pdfIndex): @pytest.mark.xfail( - condition=PANDAS_GE_130 and PANDAS_LT_140, + condition=PANDAS_GE_130, reason="https://github.com/pandas-dev/pandas/issues/43351", ) def test_multicolumn_set_item(pdf, pdfIndex): diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 15a7eab738a..5c8773edd63 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -62,12 +62,9 @@ def test_rank_all_arguments( else: expected = pdf.copy(deep=True) - # TODO: Remove per column iteration once the - # following issue is fixed : - # https://github.com/pandas-dev/pandas/issues/43310 - for col in expected.columns: - expected[col] = pdf[col].rank(**kwargs) actual = gdf.rank(**kwargs) + expected = pdf.rank(**kwargs) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 08311f89148..94061b8543b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,6 +8,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_134, PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -56,7 +57,12 @@ def test_series_replace_all(gsr, to_replace, value): pd_value = value actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - expected = psr.replace(to_replace=pd_to_replace, value=pd_value) + if pd_value is None: + # TODO: Remove this workaround once cudf + # introduces `no_default` values + expected = psr.replace(to_replace=pd_to_replace) + else: + expected = psr.replace(to_replace=pd_to_replace, value=pd_value) assert_eq( expected.sort_values().reset_index(drop=True), @@ -160,12 +166,18 @@ def test_series_replace_with_nulls(): "c": ["abc", "def", ".", None, None], } ), - cudf.DataFrame( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - dtype="category", + pytest.param( + cudf.DataFrame( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + dtype="category", + ), + marks=pytest.mark.xfail( + condition=not PANDAS_LT_140, + reason="https://github.com/pandas-dev/pandas/issues/46672", + ), ), cudf.DataFrame( { @@ -229,7 +241,10 @@ def test_dataframe_replace(df, to_replace, value): else: gd_to_replace = to_replace - expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) + if pd_value is None: + expected = pdf.replace(to_replace=pd_to_replace) + else: + expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) @@ -986,7 +1001,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): pd.Series(["one", "two", "three"], dtype="category"), {"to_replace": "one", "value": "two", "inplace": True}, marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43232" + condition=not PANDAS_GE_134, + reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), ( diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 397d7f1c277..bede054037d 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_LT_140 from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -522,8 +522,10 @@ def get_window_bounds(self, num_values, min_periods, center, closed): "indexer", [ pd.api.indexers.FixedForwardWindowIndexer(window_size=2), - pd.core.window.indexers.ExpandingIndexer(), - pd.core.window.indexers.FixedWindowIndexer(window_size=3), + pd.core.window.expanding.ExpandingIndexer(), + pd.core.window.indexers.FixedWindowIndexer(window_size=3) + if PANDAS_LT_140 + else pd.core.indexers.objects.FixedWindowIndexer(window_size=3), ], ) def test_rolling_indexer_support(indexer): diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d783483a8cb..e8d93caaf55 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -379,11 +379,21 @@ def test_write_parquet(s3_base, s3so, pdf, partition_cols): def test_read_json(s3_base, s3so): fname = "test_json_reader.json" bname = "json" + # TODO: After following bug is fixed switch + # back to using bytes: + # https://github.com/pandas-dev/pandas/issues/46935 + + # buffer = ( + # b'{"amount": 100, "name": "Alice"}\n' + # b'{"amount": 200, "name": "Bob"}\n' + # b'{"amount": 300, "name": "Charlie"}\n' + # b'{"amount": 400, "name": "Dennis"}\n' + # ) buffer = ( - b'{"amount": 100, "name": "Alice"}\n' - b'{"amount": 200, "name": "Bob"}\n' - b'{"amount": 300, "name": "Charlie"}\n' - b'{"amount": 400, "name": "Dennis"}\n' + '{"amount": 100, "name": "Alice"}\n' + '{"amount": 200, "name": "Bob"}\n' + '{"amount": 300, "name": "Charlie"}\n' + '{"amount": 400, "name": "Dennis"}\n' ) with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d755ed58724..c11ab16ccec 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -12,7 +12,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120 +from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140 from cudf.testing._utils import ( NUMERIC_TYPES, TIMEDELTA_TYPES, @@ -596,7 +596,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): @pytest.mark.parametrize( - "df", + "gs", [ cudf.Series([1, 2, 3]), cudf.Series([None]), @@ -648,11 +648,11 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): ], ) @pytest.mark.parametrize("dropna", [True, False]) -def test_series_mode(df, dropna): - pdf = df.to_pandas() +def test_series_mode(gs, dropna): + ps = gs.to_pandas() - expected = pdf.mode(dropna=dropna) - actual = df.mode(dropna=dropna) + expected = ps.mode(dropna=dropna) + actual = gs.mode(dropna=dropna) assert_eq(expected, actual, check_dtype=False) @@ -1248,7 +1248,8 @@ def test_series_upcast_float16(data): pd.RangeIndex(4, -1, -2), marks=[ pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43591" + condition=PANDAS_LT_140, + reason="https://github.com/pandas-dev/pandas/issues/43591", ) ], ), diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index fd3f2732556..733fb4d5e4d 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -157,7 +157,7 @@ def test_series_set_equal_length_object_by_mask(replace_data): psr[pd_bool_col] = ( replace_data.to_pandas(nullable=True) if hasattr(replace_data, "to_pandas") - else replace_data + else pd.Series(replace_data) ) gsr[gd_bool_col] = replace_data @@ -167,7 +167,7 @@ def test_series_set_equal_length_object_by_mask(replace_data): psr[psr > 1] = ( replace_data.to_pandas() if hasattr(replace_data, "to_pandas") - else replace_data + else pd.Series(replace_data) ) gsr[gsr > 1] = replace_data @@ -220,12 +220,23 @@ def test_column_set_unequal_length_object_by_mask(): def test_categorical_setitem_invalid(): - ps = pd.Series([1, 2, 3], dtype="category") + # ps = pd.Series([1, 2, 3], dtype="category") gs = cudf.Series([1, 2, 3], dtype="category") - assert_exceptions_equal( - lfunc=ps.__setitem__, - rfunc=gs.__setitem__, - lfunc_args_and_kwargs=([0, 5], {}), - rfunc_args_and_kwargs=([0, 5], {}), - ) + # TODO: After https://github.com/pandas-dev/pandas/issues/46646 + # is fixed remove the following workaround and + # uncomment assert_exceptions_equal + # WORKAROUND + with pytest.raises( + ValueError, + match="Cannot setitem on a Categorical with a new category, set the " + "categories first", + ): + gs[0] = 5 + + # assert_exceptions_equal( + # lfunc=ps.__setitem__, + # rfunc=gs.__setitem__, + # lfunc_args_and_kwargs=([0, 5], {}), + # rfunc_args_and_kwargs=([0, 5], {}), + # ) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f134849663d..4635d6d531b 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -486,7 +486,7 @@ def test_df_corr(method): "cumprod", ], ) -@pytest.mark.parametrize("skipna", [True, False, None]) +@pytest.mark.parametrize("skipna", [True, False]) def test_nans_stats(data, ops, skipna): psr = _create_pandas_series(data) gsr = cudf.Series(data, nan_as_null=False) @@ -512,7 +512,7 @@ def test_nans_stats(data, ops, skipna): ], ) @pytest.mark.parametrize("ops", ["sum", "product", "prod"]) -@pytest.mark.parametrize("skipna", [True, False, None]) +@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) def test_min_count_ops(data, ops, skipna, min_count): psr = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 8a118e0e1d6..cce2ac639ef 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -78,7 +78,7 @@ def test_timedelta_series_create(data, dtype): if dtype not in ("timedelta64[ns]"): pytest.skip( - "Bug in pandas" "https://github.com/pandas-dev/pandas/issues/35465" + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" ) psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype @@ -102,7 +102,7 @@ def test_timedelta_series_create(data, dtype): def test_timedelta_from_typecast(data, dtype, cast_dtype): if dtype not in ("timedelta64[ns]"): pytest.skip( - "Bug in pandas" "https://github.com/pandas-dev/pandas/issues/35465" + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" ) psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype @@ -1177,6 +1177,7 @@ def test_timedelta_invalid_ops(): rfunc=operator.mod, lfunc_args_and_kwargs=([psr, "a"],), rfunc_args_and_kwargs=([sr, "a"],), + check_exception_type=False, compare_error_message=False, ) diff --git a/python/cudf/setup.py b/python/cudf/setup.py index a447fcfe027..5c2bff92648 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -34,7 +34,7 @@ "Cython>=0.29,<0.30", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.4.0dev0", + "pandas>=1.0,<1.5.0dev0", "typing_extensions", "protobuf", "nvtx>=0.2.1", diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 5a8b2d1b216..fab847fe0f4 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -14,13 +14,13 @@ "distributed>=2022.03.0", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.4.0dev0", + "pandas>=1.0,<1.5.0dev0", ] extras_require = { "test": [ "numpy", - "pandas>=1.0,<1.4.0dev0", + "pandas>=1.0,<1.5.0dev0", "pytest", "numba>=0.53.1", "dask>=2021.09.1",