From 14f54ac6af052b3dff8e31d5c9eb579094f129ab Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Feb 2023 17:01:09 -0600 Subject: [PATCH 001/162] Update value_counts with new behavior (#12835) This PR updates value_counts behavior to match pandas-2.x, the result name will be count (or proportion if normalize=True is passed), and the index will be named after the original object name. This PR also fixes two dtype APIs that are breaking changes on pandas side. --- python/cudf/cudf/api/types.py | 5 ++--- python/cudf/cudf/core/dataframe.py | 9 ++++++++- python/cudf/cudf/core/series.py | 20 ++++++++++---------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 62f8377a323..ffe89e3e779 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. """Define common type operations.""" @@ -244,7 +244,6 @@ def _union_categoricals( is_datetime64_dtype = pd_types.is_datetime64_dtype is_datetime64_ns_dtype = pd_types.is_datetime64_ns_dtype is_datetime64tz_dtype = pd_types.is_datetime64tz_dtype -is_extension_type = pd_types.is_extension_type is_extension_array_dtype = pd_types.is_extension_array_dtype is_float_dtype = _wrap_pandas_is_dtype_api(pd_types.is_float_dtype) is_int64_dtype = pd_types.is_int64_dtype @@ -263,7 +262,7 @@ def _union_categoricals( is_named_tuple = pd_types.is_named_tuple is_iterator = pd_types.is_iterator is_bool = pd_types.is_bool -is_categorical = pd_types.is_categorical +is_categorical = pd_types.is_categorical_dtype is_complex = pd_types.is_complex is_float = pd_types.is_float is_hashable = pd_types.is_hashable diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d43621d3d36..19f19cd2cb0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7211,12 +7211,18 @@ 
def value_counts( >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6], ... 'num_wings': [2, 0, 0, 0]}, ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 >>> df.value_counts() num_legs num_wings 4 0 2 2 2 1 6 0 1 - dtype: int64 + Name: count, dtype: int64 """ if subset: diff = set(subset) - set(self._data) @@ -7238,6 +7244,7 @@ def value_counts( # Pandas always returns MultiIndex even if only one column. if not isinstance(result.index, MultiIndex): result.index = MultiIndex._from_data(result._index._data) + result.name = "proportion" if normalize else "count" return result diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 60655c5a6f9..7838e9409a2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2946,7 +2946,7 @@ def value_counts( 3.0 3 2.0 2 1.0 1 - dtype: int32 + Name: count, dtype: int32 The order of the counts can be changed by passing ``ascending=True``: @@ -2954,7 +2954,7 @@ def value_counts( 1.0 1 2.0 2 3.0 3 - dtype: int32 + Name: count, dtype: int32 With ``normalize`` set to True, returns the relative frequency by dividing all values by the sum of values. 
@@ -2963,7 +2963,7 @@ def value_counts( 3.0 0.500000 2.0 0.333333 1.0 0.166667 - dtype: float32 + Name: proportion, dtype: float32 To include ``NA`` value counts, pass ``dropna=False``: @@ -2983,24 +2983,24 @@ def value_counts( 2.0 2 2 1.0 1 - dtype: int32 + Name: count, dtype: int32 >>> s = cudf.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(bins=3) (2.0, 3.0] 2 (0.996, 2.0] 2 (3.0, 4.0] 1 - dtype: int32 + Name: count, dtype: int32 """ if bins is not None: series_bins = cudf.cut(self, bins, include_lowest=True) - + result_name = "proportion" if normalize else "count" if dropna and self.null_count == len(self): return Series( [], dtype=np.int32, - name=self.name, - index=cudf.Index([], dtype=self.dtype), + name=result_name, + index=cudf.Index([], dtype=self.dtype, name=self.name), ) if bins is not None: @@ -3009,7 +3009,7 @@ def value_counts( else: res = self.groupby(self, dropna=dropna).count(dropna=dropna) - res.index.name = None + res.index.name = self.name if sort: res = res.sort_values(ascending=ascending) @@ -3024,7 +3024,7 @@ def value_counts( res.index._column, res.index.categories.dtype ) res.index = int_index - + res.name = result_name return res @_cudf_nvtx_annotate From 7d62d4e1638322076198fe341aa84ed73498c10d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 24 Feb 2023 16:57:48 -0600 Subject: [PATCH 002/162] Drop inplace parameter in categorical methods (#12846) This PR drops `inplace` parameters in categorical methods, these are also removed as part of pandas-2.0 --- python/cudf/cudf/core/column/categorical.py | 213 ++------------------ python/cudf/cudf/tests/test_categorical.py | 153 +++----------- 2 files changed, 41 insertions(+), 325 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 52f7c0b957f..a44d63cea23 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,7 +2,6 @@ from __future__ import annotations 
-import warnings from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast @@ -130,28 +129,14 @@ def ordered(self) -> bool: """ return self._column.ordered - def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: + def as_ordered(self) -> Optional[SeriesOrIndex]: """ Set the Categorical to be ordered. - Parameters - ---------- - inplace : bool, default False - Whether or not to add the categories inplace - or return a copy of this categorical with - added categories. - - .. deprecated:: 23.02 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories as ordered will always - return a new Categorical object. - Returns ------- Categorical - Ordered Categorical or None if inplace. + Ordered Categorical. Examples -------- @@ -177,47 +162,13 @@ def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: 6 10 dtype: category Categories (3, int64): [1 < 2 < 10] - >>> s.cat.as_ordered(inplace=True) - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1 < 2 < 10] """ - if inplace: - warnings.warn( - "The inplace parameter is deprecated and will be removed in a " - "future release. set_ordered will always return a new Series " - "in the future.", - FutureWarning, - ) - return self._return_or_inplace( - self._column.as_ordered(), inplace=inplace - ) + return self._return_or_inplace(self._column.as_ordered()) - def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: + def as_unordered(self) -> Optional[SeriesOrIndex]: """ Set the Categorical to be unordered. - Parameters - ---------- - inplace : bool, default False - Whether or not to set the ordered attribute - in-place or return a copy of this - categorical with ordered set to False. - - .. 
deprecated:: 23.02 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories as unordered will always - return a new Categorical object. - Returns ------- Categorical @@ -258,32 +209,11 @@ def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: 6 10 dtype: category Categories (3, int64): [1, 2, 10] - >>> s.cat.as_unordered(inplace=True) - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] """ - if inplace: - warnings.warn( - "The inplace parameter is deprecated and will be removed in a " - "future release. set_ordered will always return a new Series " - "in the future.", - FutureWarning, - ) - return self._return_or_inplace( - self._column.as_unordered(), inplace=inplace - ) - def add_categories( - self, new_categories: Any, inplace: bool = False - ) -> Optional[SeriesOrIndex]: + return self._return_or_inplace(self._column.as_unordered()) + + def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: """ Add new categories. @@ -295,23 +225,11 @@ def add_categories( ---------- new_categories : category or list-like of category The new categories to be included. - inplace : bool, default False - Whether or not to add the categories inplace - or return a copy of this categorical with - added categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Adding categories will always return a - new Categorical object. Returns ------- cat - Categorical with new categories added or - None if inplace. + Categorical with new categories added. 
Examples -------- @@ -332,21 +250,8 @@ def add_categories( 1 2 dtype: category Categories (2, int64): [1, 2] - >>> s.cat.add_categories([0, 3, 4], inplace=True) - >>> s - 0 1 - 1 2 - dtype: category - Categories (5, int64): [1, 2, 0, 3, 4] """ - if inplace: - warnings.warn( - "The `inplace` parameter in cudf.Series.cat.add_categories " - "is deprecated and will be removed in a future version of " - "cudf. Adding categories will always return a new " - "Categorical object.", - FutureWarning, - ) + old_categories = self._column.categories new_categories = column.as_column( new_categories, @@ -376,12 +281,11 @@ def add_categories( if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace(out_col) def remove_categories( self, removals: Any, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Remove the specified categories. @@ -394,23 +298,11 @@ def remove_categories( ---------- removals : category or list-like of category The categories which should be removed. - inplace : bool, default False - Whether or not to remove the categories - inplace or return a copy of this categorical - with removed categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Removing categories will always return a - new Categorical object. Returns ------- cat - Categorical with removed categories or None - if inplace. 
+ Categorical with removed categories Examples -------- @@ -446,27 +338,7 @@ def remove_categories( 6 10 dtype: category Categories (3, int64): [1, 2, 10] - >>> s.cat.remove_categories([10], inplace=True) - >>> s - 0 - 1 1 - 2 1 - 3 2 - 4 - 5 2 - 6 - dtype: category - Categories (2, int64): [1, 2] """ - if inplace: - warnings.warn( - "The `inplace` parameter in " - "cudf.Series.cat.remove_categories is deprecated and " - "will be removed in a future version of cudf. " - "Removing categories will always return a new " - "Categorical object.", - FutureWarning, - ) cats = self.categories.to_series() removals = cudf.Series(removals, dtype=cats.dtype) @@ -483,14 +355,13 @@ def remove_categories( if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace(out_col) def set_categories( self, new_categories: Any, ordered: bool = False, rename: bool = False, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Set the categories to the specified new_categories. @@ -525,23 +396,11 @@ def set_categories( Whether or not the `new_categories` should be considered as a rename of the old categories or as reordered categories. - inplace : bool, default False - Whether or not to reorder the categories in-place - or return a copy of this categorical with - reordered categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Setting categories will always return a - new Categorical object. Returns ------- cat Categorical with reordered categories - or None if inplace. 
Examples -------- @@ -565,37 +424,18 @@ def set_categories( 5 10 dtype: category Categories (2, int64): [1, 10] - >>> s.cat.set_categories([1, 10], inplace=True) - >>> s - 0 1 - 1 1 - 2 - 3 10 - 4 - 5 10 - dtype: category - Categories (2, int64): [1, 10] """ - if inplace: - warnings.warn( - "The `inplace` parameter in cudf.Series.cat.set_categories is " - "deprecated and will be removed in a future version of cudf. " - "Setting categories will always return a new Categorical " - "object.", - FutureWarning, - ) + return self._return_or_inplace( self._column.set_categories( new_categories=new_categories, ordered=ordered, rename=rename - ), - inplace=inplace, + ) ) def reorder_categories( self, new_categories: Any, ordered: bool = False, - inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Reorder categories as specified in new_categories. @@ -611,23 +451,11 @@ def reorder_categories( Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. - inplace : bool, default False - Whether or not to reorder the categories - inplace or return a copy of this categorical - with reordered categories. - - .. deprecated:: 23.04 - - The `inplace` parameter is is deprecated and - will be removed in a future version of cudf. - Reordering categories will always return a - new Categorical object. Returns ------- cat - Categorical with reordered categories or - None if inplace. + Categorical with reordered categories Raises ------ @@ -664,18 +492,9 @@ def reorder_categories( ValueError: items in new_categories are not the same as in old categories """ - if inplace: - warnings.warn( - "The `inplace` parameter in " - "cudf.Series.cat.reorder_categories is deprecated " - "and will be removed in a future version of cudf. 
" - "Reordering categories will always return a new " - "Categorical object.", - FutureWarning, - ) + return self._return_or_inplace( self._column.reorder_categories(new_categories, ordered=ordered), - inplace=inplace, ) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 496039ca2f8..6a705e2fa63 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,30 +11,14 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_134 +from cudf.core._compat import PANDAS_GE_110 from cudf.testing._utils import ( NUMERIC_TYPES, assert_eq, assert_exceptions_equal, - expect_warning_if, ) -@contextmanager -def _hide_deprecated_pandas_categorical_inplace_warnings(function_name): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - ( - "The `inplace` parameter in " - f"pandas.Categorical.{function_name} is deprecated and will " - "be removed in a future version." - ), - category=FutureWarning, - ) - yield - - @contextmanager def _hide_cudf_safe_casting_warning(): with warnings.catch_warnings(): @@ -363,8 +347,7 @@ def test_categorical_set_categories_preserves_order(): ) -@pytest.mark.parametrize("inplace", [True, False]) -def test_categorical_as_ordered(pd_str_cat, inplace): +def test_categorical_as_ordered(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) @@ -372,23 +355,15 @@ def test_categorical_as_ordered(pd_str_cat, inplace): assert cd_sr.cat.ordered is False assert cd_sr.cat.ordered == pd_sr.cat.ordered - # pandas internally uses a deprecated call to set_ordered(inplace=inplace) - # inside as_ordered. 
- with pytest.warns(FutureWarning): - pd_sr_1 = pd_sr.cat.as_ordered(inplace=inplace) - with expect_warning_if(inplace, FutureWarning): - cd_sr_1 = cd_sr.cat.as_ordered(inplace=inplace) - if inplace: - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr + pd_sr_1 = pd_sr.cat.as_ordered() + cd_sr_1 = cd_sr.cat.as_ordered() assert cd_sr_1.cat.ordered is True assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered assert str(cd_sr_1) == str(pd_sr_1) -@pytest.mark.parametrize("inplace", [True, False]) -def test_categorical_as_unordered(pd_str_cat, inplace): +def test_categorical_as_unordered(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) @@ -396,15 +371,8 @@ def test_categorical_as_unordered(pd_str_cat, inplace): assert cd_sr.cat.ordered is True assert cd_sr.cat.ordered == pd_sr.cat.ordered - # pandas internally uses a deprecated call to set_ordered(inplace=inplace) - # inside as_unordered. - with pytest.warns(FutureWarning): - pd_sr_1 = pd_sr.cat.as_unordered(inplace=inplace) - with expect_warning_if(inplace, FutureWarning): - cd_sr_1 = cd_sr.cat.as_unordered(inplace=inplace) - if inplace: - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr + pd_sr_1 = pd_sr.cat.as_unordered() + cd_sr_1 = cd_sr.cat.as_unordered() assert cd_sr_1.cat.ordered is False assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered @@ -413,22 +381,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace): @pytest.mark.parametrize("from_ordered", [True, False]) @pytest.mark.parametrize("to_ordered", [True, False]) -@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_reorder_categories( - pd_str_cat, from_ordered, to_ordered, inplace -): +def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) cd_sr 
= cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) @@ -437,39 +390,19 @@ def test_categorical_reorder_categories( assert str(pd_sr) == str(cd_sr) - kwargs = dict(ordered=to_ordered, inplace=inplace) + kwargs = dict( + ordered=to_ordered, + ) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "reorder_categories" - ): - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr - else: - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) + pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) assert_eq(pd_sr_1, cd_sr_1) assert str(cd_sr_1) == str(pd_sr_1) -@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_add_categories(pd_str_cat, inplace): +def test_categorical_add_categories(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -478,18 +411,8 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "add_categories" - ): - pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) - - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr - else: - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) + pd_sr_1 = pd_sr.cat.add_categories(["d"]) + cd_sr_1 = cd_sr.cat.add_categories(["d"]) assert "d" in pd_sr_1.cat.categories.to_list() assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() @@ -497,20 +420,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert_eq(pd_sr_1, cd_sr_1) 
-@pytest.mark.parametrize( - "inplace", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", - ), - ), - False, - ], -) -def test_categorical_remove_categories(pd_str_cat, inplace): +def test_categorical_remove_categories(pd_str_cat): pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -519,18 +429,8 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - with _hide_deprecated_pandas_categorical_inplace_warnings( - "remove_categories" - ): - pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) - - if inplace: - with pytest.warns(FutureWarning): - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) - pd_sr_1 = pd_sr - cd_sr_1 = cd_sr - else: - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) + pd_sr_1 = pd_sr.cat.remove_categories(["a"]) + cd_sr_1 = cd_sr.cat.remove_categories(["a"]) assert "a" not in pd_sr_1.cat.categories.to_list() assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() @@ -538,15 +438,12 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert_eq(pd_sr_1, cd_sr_1) # test using ordered operators - with _hide_deprecated_pandas_categorical_inplace_warnings( - "remove_categories" - ) as _, pytest.warns(FutureWarning) as _: - assert_exceptions_equal( - lfunc=cd_sr.to_pandas().cat.remove_categories, - rfunc=cd_sr.cat.remove_categories, - lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - ) + assert_exceptions_equal( + lfunc=cd_sr.to_pandas().cat.remove_categories, + rfunc=cd_sr.cat.remove_categories, + lfunc_args_and_kwargs=([["a", "d"]], {}), + rfunc_args_and_kwargs=([["a", "d"]], {}), + ) def test_categorical_dataframe_slice_copy(): From d1b1ea80a88053fae05f07f7805c96614902b70b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 24 Feb 2023 17:10:07 -0600 
Subject: [PATCH 003/162] [REVIEW] Raise error when `numeric_only=True` for non-numeric Series (#12843) This PR raises an error when numeric_only=True for rank if the Series is of non-numeric dtype. --- python/cudf/cudf/core/indexed_frame.py | 11 +++++++++-- python/cudf/cudf/tests/test_rank.py | 21 +++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2992cb005e5..159cc318789 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4771,7 +4771,7 @@ def rank( self, axis=0, method="average", - numeric_only=None, + numeric_only=False, na_option="keep", ascending=True, pct=False, @@ -4794,7 +4794,7 @@ def rank( * max: highest rank in the group * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional + numeric_only : bool, default False For DataFrame objects, rank only numeric columns if set to True. na_option : {'keep', 'top', 'bottom'}, default 'keep' How to rank NaN values: @@ -4829,6 +4829,13 @@ def rank( source = self if numeric_only: + if isinstance( + source, cudf.Series + ) and not _is_non_decimal_numeric_dtype(self.dtype): + raise TypeError( + "Series.rank does not allow numeric_only=True with " + "non-numeric dtype." + ) numeric_cols = ( name for name in self._data.names diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 9bd67309ece..0aa3d53f962 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from itertools import chain, combinations_with_replacement, product @@ -55,13 +55,18 @@ def test_rank_all_arguments( assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs)) assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs)) if numeric_only: - with pytest.warns(FutureWarning): - expect = pdf["str"].rank(**kwargs) - got = gdf["str"].rank(**kwargs) - assert expect.empty == got.empty - expected = pdf.select_dtypes(include=np.number) - else: - expected = pdf.copy(deep=True) + assert_exceptions_equal( + lfunc=pdf["str"].rank, + rfunc=gdf["str"].rank, + lfunc_args_and_kwargs=( + [], + kwargs, + ), + rfunc_args_and_kwargs=( + [], + kwargs, + ), + ) actual = gdf.rank(**kwargs) expected = pdf.rank(**kwargs) From 63177336c224c35163a8f0a97456a15c0626abf0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 28 Feb 2023 10:53:03 -0600 Subject: [PATCH 004/162] Drop is_monotonic (#12853) This PR drops support for `Series.is_monotonic` & `Index.is_monotonic`. Instead, the alternative will be `.is_monotonic_increasing`. 
--- docs/cudf/source/api_docs/index_objects.rst | 1 - python/cudf/cudf/core/_base_index.py | 19 --------- python/cudf/cudf/core/single_column_frame.py | 23 +--------- python/cudf/cudf/tests/test_dataframe.py | 2 - python/cudf/cudf/tests/test_monotonic.py | 45 ++------------------ 5 files changed, 5 insertions(+), 85 deletions(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 0a6e3c169f0..03eb6d68538 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -25,7 +25,6 @@ Properties Index.has_duplicates Index.duplicated Index.hasnans - Index.is_monotonic Index.is_monotonic_increasing Index.is_monotonic_decreasing Index.is_unique diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8f8f2afc734..80ebf88245f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,7 +3,6 @@ from __future__ import annotations import pickle -import warnings from functools import cached_property from typing import Any, Set, TypeVar @@ -189,24 +188,6 @@ def _clean_nulls_from_index(self): """ raise NotImplementedError - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - warnings.warn( - "is_monotonic is deprecated and will be removed in a future " - "version. Use is_monotonic_increasing instead.", - FutureWarning, - ) - - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """Return boolean if values in the object are monotonically increasing. 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index afd06ea3629..46cf49c62e0 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,9 +1,9 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. + """Base class for Frame types that only have a single column.""" from __future__ import annotations -import warnings from typing import Any, Dict, Optional, Tuple, TypeVar, Union import cupy @@ -223,25 +223,6 @@ def is_unique(self): """ return self._column.is_unique - @property # type: ignore - @_cudf_nvtx_annotate - def is_monotonic(self): - """Return boolean if values in the object are monotonically increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - warnings.warn( - "is_monotonic is deprecated and will be removed in a future " - "version. Use is_monotonic_increasing instead.", - FutureWarning, - ) - - return self.is_monotonic_increasing - @property # type: ignore @_cudf_nvtx_annotate def is_monotonic_increasing(self): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 13f312f6f0c..a727644e42f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2518,8 +2518,6 @@ def test_unary_operators(func, pdf, gdf): def test_is_monotonic(gdf): pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - with pytest.warns(FutureWarning): - assert not gdf.index.is_monotonic assert not gdf.index.is_monotonic_increasing assert not gdf.index.is_monotonic_decreasing diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index f4e8b80342a..93c202c3138 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -1,7 +1,8 @@ -# Copyright (c) 
2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. """ -Tests related to is_unique and is_monotonic attributes +Tests related to is_unique, is_monotonic_increasing & +is_monotonic_decreasing attributes """ import numpy as np import pandas as pd @@ -30,11 +31,6 @@ def test_range_index(testrange): ) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -58,11 +54,6 @@ def test_generic_index(testlist): index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -82,11 +73,6 @@ def test_string_index(testlist): index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -102,11 +88,6 @@ def test_categorical_index(testlist): index_pd = pd.CategoricalIndex(raw_cat) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -147,11 +128,6 @@ def 
test_datetime_index(testlist): index_pd = pd.DatetimeIndex(testlist) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing @@ -174,11 +150,6 @@ def test_series(testlist): series_pd = pd.Series(testlist) assert series.is_unique == series_pd.is_unique - with pytest.warns(FutureWarning): - expect = series_pd.index.is_monotonic - with pytest.warns(FutureWarning): - got = series.index.is_monotonic - assert got == expect assert series.is_monotonic_increasing == series_pd.is_monotonic_increasing assert series.is_monotonic_decreasing == series_pd.is_monotonic_decreasing @@ -203,11 +174,6 @@ def test_multiindex(): gdf = cudf.from_pandas(pdf) assert pdf.index.is_unique == gdf.index.is_unique - with pytest.warns(FutureWarning): - expect = pdf.index.is_monotonic - with pytest.warns(FutureWarning): - got = gdf.index.is_monotonic - assert got == expect assert ( pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing ) @@ -242,11 +208,6 @@ def test_multiindex_tuples(testarr): index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) assert index.is_unique == index_pd.is_unique - with pytest.warns(FutureWarning): - expect = index_pd.is_monotonic - with pytest.warns(FutureWarning): - got = index.is_monotonic - assert got == expect assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing From 5af05836c2df4ae92514ebd696e1cd18aa80296e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 8 Mar 2023 13:39:35 -0600 Subject: [PATCH 005/162] [REVIEW] Drop `datetime_is_numeric` parameter from `describe` (#12890) This PR removes support for `datetime_is_numeric` parameter in `describe`. 
--- python/cudf/cudf/core/dataframe.py | 13 +------------ python/cudf/cudf/core/series.py | 9 --------- python/cudf/cudf/tests/test_dataframe.py | 19 +++++++------------ python/cudf/cudf/tests/test_series.py | 14 +++++--------- python/cudf/cudf/utils/docutils.py | 9 --------- 5 files changed, 13 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 83da93d9ae1..d9900f3adc3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4929,21 +4929,11 @@ def describe( percentiles=None, include=None, exclude=None, - datetime_is_numeric=False, ): """{docstring}""" if not include and not exclude: - default_include = [np.number] - if datetime_is_numeric: - default_include.append("datetime") - else: - warnings.warn( - "`datetime_is_numeric` is deprecated. Specify " - "`datetime_is_numeric=True` to silence this " - "warning and adopt the future behavior now.", - FutureWarning, - ) + default_include = [np.number, "datetime"] data_to_describe = self.select_dtypes(include=default_include) if data_to_describe._num_columns == 0: data_to_describe = self @@ -4964,7 +4954,6 @@ def describe( describe_series_list = [ data_to_describe[col].describe( percentiles=percentiles, - datetime_is_numeric=datetime_is_numeric, ) for col in data_to_describe._column_names ] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bd5569c042c..d486851176a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,7 +6,6 @@ import inspect import pickle import textwrap -import warnings from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union @@ -3108,17 +3107,9 @@ def describe( percentiles=None, include=None, exclude=None, - datetime_is_numeric=False, ): """{docstring}""" - if not datetime_is_numeric: - warnings.warn( - "`datetime_is_numeric` is deprecated and 
will be removed in " - "a future release. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - ) if percentiles is not None: if not all(0 <= x <= 1 for x in percentiles): raise ValueError( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5db338e66cf..df235d48a30 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3654,8 +3654,8 @@ def test_dataframe_describe_exclude(): df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(exclude=["float"]) + + gdf_results = df.describe(exclude=["float"]) pdf_results = pdf.describe(exclude=["float"]) assert_eq(gdf_results, pdf_results) @@ -3670,8 +3670,7 @@ def test_dataframe_describe_include(): df["x"] = df.x.astype("int64") df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(include=["int"]) + gdf_results = df.describe(include=["int"]) pdf_results = pdf.describe(include=["int"]) assert_eq(gdf_results, pdf_results) @@ -3685,8 +3684,7 @@ def test_dataframe_describe_default(): df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe() + gdf_results = df.describe() pdf_results = pdf.describe() assert_eq(pdf_results, gdf_results) @@ -3703,8 +3701,7 @@ def test_series_describe_include_all(): df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(include="all") + gdf_results = df.describe(include="all") pdf_results = pdf.describe(include="all") assert_eq(gdf_results[["x", "y"]], pdf_results[["x", "y"]]) @@ -3725,8 +3722,7 @@ def 
test_dataframe_describe_percentiles(): df["x"] = np.random.normal(10, 1, data_length) df["y"] = np.random.normal(10, 1, data_length) pdf = df.to_pandas() - with pytest.warns(FutureWarning): - gdf_results = df.describe(percentiles=sample_percentiles) + gdf_results = df.describe(percentiles=sample_percentiles) pdf_results = pdf.describe(percentiles=sample_percentiles) assert_eq(pdf_results, gdf_results) @@ -4053,8 +4049,7 @@ def test_empty_dataframe_describe(): gdf = cudf.from_pandas(pdf) expected = pdf.describe() - with pytest.warns(FutureWarning): - actual = gdf.describe() + actual = gdf.describe() assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ce519a445ba..f08295d228d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -408,8 +408,7 @@ def test_series_size(data): def test_series_describe_numeric(dtype): ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) gs = cudf.from_pandas(ps) - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() expected = ps.describe() assert_eq(expected, actual, check_dtype=True) @@ -426,9 +425,8 @@ def test_series_describe_datetime(dtype): # Treating datetimes as categoricals is deprecated in pandas and will # be removed in future. Future behavior is treating datetime as numeric. 
- expected = ps.describe(datetime_is_numeric=True) - with pytest.warns(FutureWarning): - actual = gs.describe() + expected = ps.describe() + actual = gs.describe() assert_eq(expected.astype("str"), actual) @@ -439,8 +437,7 @@ def test_series_describe_timedelta(dtype): gs = cudf.from_pandas(ps) expected = ps.describe() - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() assert_eq(actual, expected.astype("str")) @@ -465,8 +462,7 @@ def test_series_describe_other_types(ps): gs = cudf.from_pandas(ps) expected = ps.describe() - with pytest.warns(FutureWarning): - actual = gs.describe() + actual = gs.describe() if len(ps) == 0: assert_eq(expected.fillna("a").astype("str"), actual.fillna("a")) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 5a7b8bae980..1a9c4b54aa9 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -126,15 +126,6 @@ def wrapper(func): exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. - datetime_is_numeric : bool, default False - For DataFrame input, this also controls whether datetime columns - are included by default. - - .. deprecated:: 23.04 - - `datetime_is_numeric` is deprecated and will be removed in - a future version of cudf. 
- Returns ------- output_frame : Series or DataFrame From 531f52cde5b014d6533119968ee4ef7edda5b313 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 8 Mar 2023 13:55:54 -0600 Subject: [PATCH 006/162] Drop `names`, `dtype` in `Index.copy` and `dtype`, `levels`, `codes` in `MultiIndex.copy` (#12898) This PR removes `dtype` in Index & `MultiIndex.copy`, and `names` in Index.copy --- python/cudf/cudf/core/index.py | 75 +++-------------------- python/cudf/cudf/core/multiindex.py | 72 +--------------------- python/cudf/cudf/tests/test_index.py | 58 ++++++------------ python/cudf/cudf/tests/test_multiindex.py | 15 +---- 4 files changed, 30 insertions(+), 190 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd882aba297..2203d103204 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -302,7 +302,7 @@ def __contains__(self, item): return item in range(self._start, self._stop, self._step) @_cudf_nvtx_annotate - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -311,44 +311,11 @@ def copy(self, name=None, deep=False, dtype=None, names=None): name : object optional (default: None), name of index deep : Bool (default: False) Ignored for RangeIndex - dtype : numpy dtype optional (default: None) - Target dtype for underlying range data - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - names : list-like optional (default: False) - Kept compatibility with MultiIndex. Should not be used. - - .. deprecated:: 23.04 - - The parameter `names` is deprecated and will be removed in - a future version of cudf. Use the `name` parameter instead. 
Returns ------- - New RangeIndex instance with same range, casted to new dtype + New RangeIndex instance with same range """ - if dtype is not None: - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - if names is not None: - warnings.warn( - "parameter names is deprecated and will be removed in a " - "future version. Use the name parameter instead.", - FutureWarning, - ) - - dtype = self.dtype if dtype is None else dtype - - if not np.issubdtype(dtype, np.signedinteger): - raise ValueError(f"Expected Signed Integer Type, Got {dtype}") name = self.name if name is None else name @@ -1140,7 +1107,7 @@ def equals(self, other, **kwargs): return False @_cudf_nvtx_annotate - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -1151,45 +1118,17 @@ def copy(self, name=None, deep=False, dtype=None, names=None): deep : bool, default True Make a deep copy of the data. With ``deep=False`` the original data is used - dtype : numpy dtype, default None - Target datatype to cast into, use original dtype when None - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - names : list-like, default False - Kept compatibility with MultiIndex. Should not be used. - - .. deprecated:: 23.04 - - The parameter `names` is deprecated and will be removed in - a future version of cudf. Use the `name` parameter instead. Returns ------- - New index instance, casted to new dtype + New index instance. """ - if dtype is not None: - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - if names is not None: - warnings.warn( - "parameter names is deprecated and will be removed in a " - "future version. 
Use the name parameter instead.", - FutureWarning, - ) - dtype = self.dtype if dtype is None else dtype name = self.name if name is None else name - col = self._values.astype(dtype) - return _index_from_data({name: col.copy(True) if deep else col}) + return _index_from_data( + {name: self._values.copy(True) if deep else self._values} + ) @_cudf_nvtx_annotate @doc_apply(_index_astype_docstring) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1f26371f797..2951a362e73 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,6 @@ import itertools import numbers import pickle -import warnings from collections import abc from functools import cached_property from numbers import Integral @@ -318,9 +317,6 @@ def name(self, value): def copy( self, names=None, - dtype=None, - levels=None, - codes=None, deep=False, name=None, ): @@ -334,36 +330,12 @@ def copy( ---------- names : sequence of objects, optional (default None) Names for each of the index levels. - dtype : object, optional (default None) - MultiIndex dtype, only supports None or object type - - .. deprecated:: 23.02 - - The `dtype` parameter is deprecated and will be removed in - a future version of cudf. Use the `astype` method instead. - - levels : sequence of arrays, optional (default None) - The unique labels for each level. Original values used if None. - - .. deprecated:: 23.02 - - The `levels` parameter is deprecated and will be removed in - a future version of cudf. - - codes : sequence of arrays, optional (default None) - Integers for each level designating which label at each location. - Original values used if None. - - .. deprecated:: 23.02 - - The `codes` parameter is deprecated and will be removed in - a future version of cudf. - deep : Bool (default False) If True, `._data`, `._levels`, `._codes` will be copied. Ignored if `levels` or `codes` are specified. 
name : object, optional (default None) - To keep consistent with `Index.copy`, should not be used. + Kept for compatibility with 1-dimensional Index. Should not + be used. Returns ------- @@ -401,46 +373,6 @@ def copy( """ - # TODO: Update message when set_levels is implemented. - # https://github.com/rapidsai/cudf/issues/12307 - if levels is not None: - warnings.warn( - "parameter levels is deprecated and will be removed in a " - "future version.", - FutureWarning, - ) - - # TODO: Update message when set_codes is implemented. - # https://github.com/rapidsai/cudf/issues/12308 - if codes is not None: - warnings.warn( - "parameter codes is deprecated and will be removed in a " - "future version.", - FutureWarning, - ) - - if dtype is not None: - warnings.warn( - "parameter dtype is deprecated and will be removed in a " - "future version. Use the astype method instead.", - FutureWarning, - ) - - dtype = object if dtype is None else dtype - if not pd.core.dtypes.common.is_object_dtype(dtype): - raise TypeError("Dtype for MultiIndex only supports object type.") - - # ._data needs to be rebuilt - if levels is not None or codes is not None: - if self._levels is None or self._codes is None: - self._compute_levels_and_codes() - levels = self._levels if levels is None else levels - codes = self._codes if codes is None else codes - names = self.names if names is None else names - - mi = MultiIndex(levels=levels, codes=codes, names=names, copy=deep) - return mi - mi = MultiIndex._from_data(self._data.copy(deep=deep)) if self._levels is not None: mi._levels = [s.copy(deep) for s in self._levels] diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f0b74ce70e7..d4ce348fa78 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -25,7 +25,6 @@ NUMERIC_TYPES, OTHER_TYPES, SIGNED_INTEGER_TYPES, - SIGNED_TYPES, UNSIGNED_TYPES, _create_pandas_series, assert_column_memory_eq, @@ -307,90 +306,69 @@ def 
test_set_index_as_property(): @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) -def test_index_copy_range(name, dtype, deep=True): +def test_index_copy_range(name, deep=True): cidx = cudf.RangeIndex(1, 5) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype,", ["datetime64[ns]", "int64"]) -def test_index_copy_datetime(name, dtype, deep=True): +def test_index_copy_datetime(name, deep=True): cidx = cudf.DatetimeIndex(["2001", "2002", "2003"]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", ["category", "object"]) -def test_index_copy_string(name, dtype, deep=True): +def test_index_copy_string(name, deep=True): cidx = cudf.StringIndex(["a", "b", "c"]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize( - "dtype", - NUMERIC_TYPES + ["datetime64[ns]", "timedelta64[ns]"] + OTHER_TYPES, -) -def test_index_copy_integer(name, dtype, deep=True): +def 
test_index_copy_integer(name, deep=True): """Test for NumericIndex Copy Casts""" cidx = cudf.Index([1, 2, 3]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", SIGNED_TYPES) -def test_index_copy_float(name, dtype, deep=True): +def test_index_copy_float(name, deep=True): """Test for NumericIndex Copy Casts""" cidx = cudf.Index([1.0, 2.0, 3.0]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_eq(pidx_copy, cidx_copy) @pytest.mark.parametrize("name", ["x"]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["category"]) -def test_index_copy_category(name, dtype, deep=True): +def test_index_copy_category(name, deep=True): cidx = cudf.core.index.CategoricalIndex([1, 2, 3]) pidx = cidx.to_pandas() - with pytest.warns(FutureWarning): - pidx_copy = pidx.copy(name=name, deep=deep, dtype=dtype) - with pytest.warns(FutureWarning): - cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + pidx_copy = pidx.copy(name=name, deep=deep) + cidx_copy = cidx.copy(name=name, deep=deep) assert_column_memory_ne(cidx._values, cidx_copy._values) assert_eq(pidx_copy, cidx_copy) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 14e3a2a1b9b..d1da63e1d74 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -700,15 +700,8 @@ def test_multiindex_equals(): } ], ) 
-@pytest.mark.parametrize( - "levels", - [[["2000-01-01", "2000-01-02", "2000-01-03"], ["A", "B", "C"]], None], -) -@pytest.mark.parametrize( - "codes", [[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], None] -) @pytest.mark.parametrize("names", [["X", "Y"]]) -def test_multiindex_copy_sem(data, levels, codes, names): +def test_multiindex_copy_sem(data, names): """Test semantic equality for MultiIndex.copy""" gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -717,12 +710,10 @@ def test_multiindex_copy_sem(data, levels, codes, names): pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() gmi = gdf.index - with expect_warning_if(levels is not None or codes is not None): - gmi_copy = gmi.copy(levels=levels, codes=codes, names=names) + gmi_copy = gmi.copy(names=names) pmi = pdf.index - with expect_warning_if(levels is not None or codes is not None): - pmi_copy = pmi.copy(levels=levels, codes=codes, names=names) + pmi_copy = pmi.copy(names=names) for glv, plv in zip(gmi_copy.levels, pmi_copy.levels): assert all(glv.values_host == plv.values) From 7ec76b71e8d9f42693eb23f99f2362d9e3aa4a04 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 8 Mar 2023 17:13:51 -0600 Subject: [PATCH 007/162] Drop `kind` parameter from `Index.get_slice_bound` (#12856) This PR drops `kind` parameter from `Index.get_slice_bound` to match pandas-2.0 API. 
--- python/cudf/cudf/core/_base_index.py | 3 +- python/cudf/cudf/core/column/column.py | 12 ++----- python/cudf/cudf/core/index.py | 20 ++--------- python/cudf/cudf/tests/test_monotonic.py | 44 ++++++++---------------- 4 files changed, 21 insertions(+), 58 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index f72c0f8b1be..88763b8a011 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1412,7 +1412,7 @@ def rename(self, name, inplace=False): out.name = name return out - def get_slice_bound(self, label, side, kind=None): + def get_slice_bound(self, label, side): """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position @@ -1422,7 +1422,6 @@ def get_slice_bound(self, label, side, kind=None): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} Returns ------- diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 40921b71db5..31cc5a4327f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -883,7 +883,7 @@ def is_monotonic_decreasing(self) -> bool: ascending=[False], null_position=None ) - def get_slice_bound(self, label: ScalarLike, side: str, kind: str) -> int: + def get_slice_bound(self, label: ScalarLike, side: str) -> int: """ Calculate slice bound that corresponds to given label. 
Returns leftmost (one-past-the-rightmost if ``side=='right'``) position @@ -893,22 +893,14 @@ def get_slice_bound(self, label: ScalarLike, side: str, kind: str) -> int: ---------- label : Scalar side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} """ - if kind not in {"ix", "loc", "getitem", None}: - raise ValueError( - f"Invalid value for ``kind`` parameter," - f" must be either one of the following: " - f"{'ix', 'loc', 'getitem', None}, but found: {kind}" - ) + if side not in {"left", "right"}: raise ValueError( "Invalid value for side kwarg," " must be either 'left' or 'right': %s" % (side,) ) - # TODO: Handle errors/missing keys correctly - # Not currently using `kind` argument. if side == "left": return self.find_first_value(label, closest=True) elif side == "right": diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 363fa37f394..145563dce61 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -495,7 +495,7 @@ def is_monotonic_decreasing(self): return self._step < 0 or len(self) <= 1 @_cudf_nvtx_annotate - def get_slice_bound(self, label, side, kind=None): + def get_slice_bound(self, label, side): """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position @@ -506,20 +506,12 @@ def get_slice_bound(self, label, side, kind=None): label : int A valid value in the ``RangeIndex`` side : {'left', 'right'} - kind : Unused - To keep consistency with other index types. Returns ------- int Index of label. 
""" - if kind is not None: - warnings.warn( - "'kind' argument in get_slice_bound is deprecated and will be " - "removed in a future version.", - FutureWarning, - ) if side not in {"left", "right"}: raise ValueError(f"Unrecognized side parameter: {side}") @@ -1388,14 +1380,8 @@ def notna(self): notnull = notna @_cudf_nvtx_annotate - def get_slice_bound(self, label, side, kind=None): - if kind is not None: - warnings.warn( - "'kind' argument in get_slice_bound is deprecated and will be " - "removed in a future version.", - FutureWarning, - ) - return self._values.get_slice_bound(label, side, kind) + def get_slice_bound(self, label, side): + return self._values.get_slice_bound(label, side) def _is_numeric(self): return False diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 93c202c3138..e68024f03d4 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -17,7 +17,7 @@ RangeIndex, StringIndex, ) -from cudf.testing._utils import assert_eq, expect_warning_if +from cudf.testing._utils import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) @@ -222,15 +222,12 @@ def test_multiindex_tuples(testarr): ], ) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["loc", "getitem", None]) -def test_get_slice_bound(testlist, side, kind): +def test_get_slice_bound(testlist, side): index = GenericIndex(testlist) index_pd = pd.Index(testlist) for label in testlist: - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = index.get_slice_bound(label, side, kind) + expect = index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) assert got == expect @@ -240,16 +237,13 @@ def test_get_slice_bound(testlist, side, kind): [[-1, 0, 5, 10, 11], [-1, 0, 1, 2], [2, 3, 4, 5], [-1, 0, 1], [2, 3, 4]], ) 
@pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["getitem", "loc"]) -def test_rangeindex_get_slice_bound_basic(bounds, indices, side, kind): +def test_rangeindex_get_slice_bound_basic(bounds, indices, side): start, stop = bounds pd_index = pd.RangeIndex(start, stop) cudf_index = RangeIndex(start, stop) for idx in indices: - with pytest.warns(FutureWarning): - expect = pd_index.get_slice_bound(idx, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = cudf_index.get_slice_bound(idx, side, kind) + expect = pd_index.get_slice_bound(idx, side) + got = cudf_index.get_slice_bound(idx, side) assert expect == got @@ -262,31 +256,25 @@ def test_rangeindex_get_slice_bound_basic(bounds, indices, side, kind): [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], ) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["getitem", "loc"]) -def test_rangeindex_get_slice_bound_step(bounds, label, side, kind): +def test_rangeindex_get_slice_bound_step(bounds, label, side): start, stop, step = bounds pd_index = pd.RangeIndex(start, stop, step) cudf_index = RangeIndex(start, stop, step) - with pytest.warns(FutureWarning): - expect = pd_index.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, FutureWarning): - got = cudf_index.get_slice_bound(label, side, kind) + expect = pd_index.get_slice_bound(label, side) + got = cudf_index.get_slice_bound(label, side) assert expect == got @pytest.mark.parametrize("label", [1, 3, 5, 7, 9, 11]) @pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("kind", ["loc", "getitem", None]) -def test_get_slice_bound_missing(label, side, kind): +def test_get_slice_bound_missing(label, side): mylist = [2, 4, 6, 8, 10] index = GenericIndex(mylist) index_pd = pd.Index(mylist) - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, kind) - with expect_warning_if(kind is not None, 
FutureWarning): - got = index.get_slice_bound(label, side, kind) + expect = index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) assert got == expect @@ -299,10 +287,8 @@ def test_get_slice_bound_missing_str(label, side): mylist = ["b", "d", "f"] index = GenericIndex(mylist) index_pd = pd.Index(mylist) - with pytest.warns(FutureWarning): - got = index.get_slice_bound(label, side, "getitem") - with pytest.warns(FutureWarning): - expect = index_pd.get_slice_bound(label, side, "getitem") + got = index.get_slice_bound(label, side) + expect = index_pd.get_slice_bound(label, side) assert got == expect From 58b9acb28b3ed640f52c4899f26c4094da1eb352 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 10 Mar 2023 13:29:57 -0600 Subject: [PATCH 008/162] [REVIEW] Update `numeric_only` behavior in reduction APIs (#12847) - [x] This PR removes the deprecation of `numeric_only=None` and defaults to `numeric_only=False`. - [x] Removes `level` parameter from reduction APIs to match pandas-2.0 - [x] Change `axis` defaults to match pandas-2.0 APIs. --- python/cudf/cudf/core/dataframe.py | 60 ++++--- python/cudf/cudf/core/frame.py | 155 ++++++++----------- python/cudf/cudf/core/series.py | 18 +-- python/cudf/cudf/core/single_column_frame.py | 14 +- python/cudf/cudf/tests/test_dataframe.py | 36 ++--- python/cudf/cudf/tests/test_stats.py | 38 ++--- 6 files changed, 129 insertions(+), 192 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 32a0a8ca510..978109917b6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5780,7 +5780,7 @@ def _prepare_for_rowwise_op(self, method, skipna): return coerced, mask, common_dtype @_cudf_nvtx_annotate - def count(self, axis=0, level=None, numeric_only=False, **kwargs): + def count(self, axis=0, numeric_only=False): """ Count ``non-NA`` cells for each column or row. 
@@ -5793,7 +5793,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Parameters currently not supported are `axis`, `numeric_only`. Examples -------- @@ -5831,12 +5831,9 @@ def _reduce( self, op, axis=None, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") source = self if numeric_only: @@ -5872,33 +5869,28 @@ def _reduce( "skew", ) - if numeric_only is None and op in numeric_ops: - warnings.warn( - f"The default value of numeric_only in DataFrame.{op} " - "is deprecated. In a future version, it will default " - "to False. In addition, specifying " - "'numeric_only=None' is deprecated. Select only valid " - "columns or specify the value of numeric_only to " - "silence this warning.", - FutureWarning, - ) - numeric_cols = ( - name + if op in numeric_ops: + if numeric_only: + try: + result = [ + getattr(source._data[col], op)(**kwargs) + for col in source._data.names + ] + except AttributeError: + raise NotImplementedError( + f"Not all column dtypes support op {op}" + ) + elif any( + not is_numeric_dtype(self._data[name]) for name in self._data.names - if is_numeric_dtype(self._data[name]) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return Series(index=cudf.StringIndex([])) - try: - result = [ - getattr(source._data[col], op)(**kwargs) - for col in source._data.names - ] - except AttributeError: + ): raise TypeError( - f"Not all column dtypes support op {op}" + "Non numeric columns passed with " + "`numeric_only=False`, pass `numeric_only=True` " + f"to perform DataFrame.{op}" ) + else: + raise else: raise @@ -6024,14 +6016,14 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df @_cudf_nvtx_annotate - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def all(self, axis=0, 
bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + return super(DataFrame, obj).all(axis, skipna, **kwargs) @_cudf_nvtx_annotate - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def any(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + return super(DataFrame, obj).any(axis, skipna, **kwargs) @_cudf_nvtx_annotate def _apply_cupy_method_axis_1(self, method, *args, **kwargs): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ea6a6de0b2b..aaee223e854 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1885,10 +1885,9 @@ def _reduce(self, *args, **kwargs): @_cudf_nvtx_annotate def min( self, - axis=None, + axis=0, skipna=True, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -1900,35 +1899,32 @@ def min( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + numeric_only: bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- Examples -------- >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() + >>> min_series = df.min() + >>> min_series a 1 b 7 dtype: int64 + + >>> min_series.min() + 1 """ return self._reduce( "min", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -1936,9 +1932,8 @@ def min( @_cudf_nvtx_annotate def max( self, - axis=None, + axis=0, skipna=True, - level=None, numeric_only=None, **kwargs, ): @@ -1951,12 +1946,10 @@ def max( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- @@ -1979,7 +1972,6 @@ def max( "max", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -1990,8 +1982,7 @@ def sum( axis=None, skipna=True, dtype=None, - level=None, - numeric_only=None, + numeric_only=False, min_count=0, **kwargs, ): @@ -2006,6 +1997,10 @@ def sum( Exclude NA/null values when computing the result. dtype: data type Data type to cast the result to. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. min_count: int, default 0 The required number of valid values to perform the operation. If fewer than min_count non-NA values are present the result @@ -2018,10 +2013,6 @@ def sum( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- Examples -------- >>> import cudf @@ -2036,7 +2027,6 @@ def sum( axis=axis, skipna=skipna, dtype=dtype, - level=level, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -2048,8 +2038,7 @@ def product( axis=None, skipna=True, dtype=None, - level=None, - numeric_only=None, + numeric_only=False, min_count=0, **kwargs, ): @@ -2064,6 +2053,10 @@ def product( Exclude NA/null values when computing the result. dtype: data type Data type to cast the result to. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. min_count: int, default 0 The required number of valid values to perform the operation. If fewer than min_count non-NA values are present the result @@ -2076,10 +2069,6 @@ def product( ------- Series - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2097,7 +2086,6 @@ def product( axis=axis, skipna=skipna, dtype=dtype, - level=level, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -2107,9 +2095,7 @@ def product( prod = product @_cudf_nvtx_annotate - def mean( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return the mean of the values for the requested axis. @@ -2119,13 +2105,10 @@ def mean( Axis for the function to be applied on. skipna : bool, default True Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. 
+ If False, will raise error in-case there are + non-numeric columns. **kwargs Additional keyword arguments to be passed to the function. @@ -2146,7 +2129,6 @@ def mean( "mean", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -2156,9 +2138,8 @@ def std( self, axis=None, skipna=True, - level=None, ddof=1, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -2177,16 +2158,15 @@ def std( ddof: int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- Series - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2201,7 +2181,6 @@ def std( "std", axis=axis, skipna=skipna, - level=level, ddof=ddof, numeric_only=numeric_only, **kwargs, @@ -2212,9 +2191,8 @@ def var( self, axis=None, skipna=True, - level=None, ddof=1, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -2233,16 +2211,15 @@ def var( ddof: int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- scalar - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2256,16 +2233,13 @@ def var( "var", axis=axis, skipna=skipna, - level=level, ddof=ddof, numeric_only=numeric_only, **kwargs, ) @_cudf_nvtx_annotate - def kurtosis( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return Fisher's unbiased kurtosis of a sample. 
@@ -2278,15 +2252,15 @@ def kurtosis( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- Series or scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only` - Examples -------- **Series** @@ -2312,7 +2286,6 @@ def kurtosis( "kurtosis", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -2321,9 +2294,7 @@ def kurtosis( kurt = kurtosis @_cudf_nvtx_annotate - def skew( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return unbiased Fisher-Pearson skew of a sample. @@ -2331,6 +2302,10 @@ def skew( ---------- skipna: bool, default True Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- @@ -2338,8 +2313,7 @@ def skew( Notes ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` + Parameter currently not supported is `axis` Examples -------- @@ -2373,13 +2347,12 @@ def skew( "skew", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @_cudf_nvtx_annotate - def all(self, axis=0, skipna=True, level=None, **kwargs): + def all(self, axis=0, skipna=True, **kwargs): """ Return whether all elements are True in DataFrame. @@ -2398,7 +2371,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`. 
Examples -------- @@ -2413,12 +2386,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, - level=level, **kwargs, ) @_cudf_nvtx_annotate - def any(self, axis=0, skipna=True, level=None, **kwargs): + def any(self, axis=0, skipna=True, **kwargs): """ Return whether any elements is True in DataFrame. @@ -2437,7 +2409,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): Notes ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`. Examples -------- @@ -2452,7 +2424,6 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, - level=level, **kwargs, ) @@ -2486,25 +2457,26 @@ def sum_of_squares(self, dtype=None): return self._reduce("sum_of_squares", dtype=dtype) @_cudf_nvtx_annotate - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return the median of the values for the requested axis. Parameters ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. For Series this + parameter is unused and defaults to 0. skipna : bool, default True Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. Returns ------- scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. 
- Examples -------- >>> import cudf @@ -2524,7 +2496,6 @@ def median( "median", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d486851176a..041e5aa07b9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1850,20 +1850,20 @@ def between(self, left, right, inclusive="both") -> Series: return self._from_data({self.name: lmask & rmask}, self._index) @_cudf_nvtx_annotate - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def all(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." ) - return super().all(axis, skipna, level, **kwargs) + return super().all(axis, skipna, **kwargs) @_cudf_nvtx_annotate - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def any(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( "The bool_only parameter is not supported for Series." ) - return super().any(axis, skipna, level, **kwargs) + return super().any(axis, skipna, **kwargs) @_cudf_nvtx_annotate def to_pandas(self, index=True, nullable=False, **kwargs): @@ -2460,7 +2460,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): # Stats # @_cudf_nvtx_annotate - def count(self, level=None, **kwargs): + def count(self): """ Return number of non-NA/null observations in the Series @@ -2469,10 +2469,6 @@ def count(self, level=None, **kwargs): int Number of non-null values in the Series. - Notes - ----- - Parameters currently not supported is `level`. 
- Examples -------- >>> import cudf @@ -2480,10 +2476,6 @@ def count(self, level=None, **kwargs): >>> ser.count() 5 """ - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - return self.valid_count @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index da48bc0b5a9..9e380e63ae0 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -15,6 +15,7 @@ _is_scalar_or_zero_d_array, is_bool_dtype, is_integer_dtype, + is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame @@ -41,19 +42,16 @@ def _reduce( self, op, axis=None, - level=None, - numeric_only=None, + numeric_only=False, **kwargs, ): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only: - raise NotImplementedError( - f"Series.{op} does not implement numeric_only" + if numeric_only and not is_numeric_dtype(self._column): + raise TypeError( + f"Series.{op} does not allow numeric_only={numeric_only} " + "with non-numeric dtypes." 
) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index df235d48a30..fdb6790187e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8338,8 +8338,8 @@ def test_describe_misc_include(df, include): def test_describe_misc_exclude(df, exclude): pdf = df.to_pandas() - expected = pdf.describe(exclude=exclude, datetime_is_numeric=True) - actual = df.describe(exclude=exclude, datetime_is_numeric=True) + expected = pdf.describe(exclude=exclude) + actual = df.describe(exclude=exclude) for col in expected.columns: if expected[col].dtype == np.dtype("object"): @@ -9703,19 +9703,15 @@ def test_dataframe_pct_change(data, periods, fill_method): assert_eq(expected, actual) -def test_mean_timeseries(): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_mean_timeseries(numeric_only): gdf = cudf.datasets.timeseries() + if not numeric_only: + gdf = gdf.select_dtypes(include="number") pdf = gdf.to_pandas() - expected = pdf.mean(numeric_only=True) - actual = gdf.mean(numeric_only=True) - - assert_eq(expected, actual) - - with pytest.warns(FutureWarning): - expected = pdf.mean() - with pytest.warns(FutureWarning): - actual = gdf.mean() + expected = pdf.mean(numeric_only=numeric_only) + actual = gdf.mean(numeric_only=numeric_only) assert_eq(expected, actual) @@ -9730,19 +9726,15 @@ def test_mean_timeseries(): } ], ) -def test_std_different_dtypes(data): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_std_different_dtypes(data, numeric_only): gdf = cudf.DataFrame(data) + if not numeric_only: + gdf = gdf.select_dtypes(include="number") pdf = gdf.to_pandas() - expected = pdf.std(numeric_only=True) - actual = gdf.std(numeric_only=True) - - assert_eq(expected, actual) - - with pytest.warns(FutureWarning): - expected = pdf.std() - with pytest.warns(FutureWarning): - actual = gdf.std() + expected = 
pdf.std(numeric_only=numeric_only) + actual = gdf.std(numeric_only=numeric_only) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6478fbaad95..6ca64fdcfa3 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -593,30 +593,26 @@ def test_cov_corr_invalid_dtypes(gsr): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_df(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_df(data, null_flag, numeric_only): + if not numeric_only: + data = data.select_dtypes(include="number") pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - with pytest.warns(FutureWarning): - got = data.kurtosis() + got = data.kurtosis(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - with pytest.warns(FutureWarning): - expected = pdata.kurtosis() - np.testing.assert_array_almost_equal(got, expected) - with pytest.warns(FutureWarning): - got = data.kurt() - got = got if np.isscalar(got) else got.to_numpy() - with pytest.warns(FutureWarning): - expected = pdata.kurt() + expected = pdata.kurtosis(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt(numeric_only=True) + got = data.kurt(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=True) + + expected = pdata.kurt(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) @@ -629,21 +625,17 @@ def test_kurtosis_df(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_df(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_skew_df(data, null_flag, numeric_only): + if not numeric_only: + data = data.select_dtypes(include="number") pdata = data.to_pandas() if null_flag and len(data) > 2: 
data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - with pytest.warns(FutureWarning): - got = data.skew() - with pytest.warns(FutureWarning): - expected = pdata.skew() - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - got = data.skew(numeric_only=True) - expected = pdata.skew(numeric_only=True) + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) From e115ba593dc168e765fc442dabfd170367b0042c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 10 Mar 2023 17:37:25 -0600 Subject: [PATCH 009/162] [REVIEW] Drop `DataFrame.append` and `Series.append` (#12839) This PR removes `DataFrame.append` & `Series.append` to match pandas-2.0 API. Test usages are now replaced with `.concat` API calls. --- docs/cudf/source/api_docs/dataframe.rst | 1 - docs/cudf/source/api_docs/series.rst | 1 - python/cudf/cudf/core/dataframe.py | 140 ------------------- python/cudf/cudf/core/indexed_frame.py | 22 --- python/cudf/cudf/core/series.py | 76 ---------- python/cudf/cudf/tests/test_dataframe.py | 168 +++++++++++++---------- python/cudf/cudf/tests/test_series.py | 57 ++++---- 7 files changed, 119 insertions(+), 346 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index dfe1b2a9b9b..5643f9cff48 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -231,7 +231,6 @@ Combining / comparing / joining / merging .. autosummary:: :toctree: api/ - DataFrame.append DataFrame.assign DataFrame.join DataFrame.merge diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 9cd0770431c..4c0af814f85 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -218,7 +218,6 @@ Combining / comparing / joining / merging .. 
autosummary:: :toctree: api/ - Series.append Series.update Time Series-related diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 978109917b6..9cfdf46826f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6624,146 +6624,6 @@ def iterrows(self): "if you wish to iterate over each row." ) - @_cudf_nvtx_annotate - def append( - self, other, ignore_index=False, verify_integrity=False, sort=False - ): - """ - Append rows of `other` to the end of caller, returning a new object. - Columns in `other` that are not in the caller are added as new columns. - - Parameters - ---------- - other : DataFrame or Series/dict-like object, or list of these - The data to append. - ignore_index : bool, default False - If True, do not use the index labels. - sort : bool, default False - Sort columns ordering if the columns of - `self` and `other` are not aligned. - verify_integrity : bool, default False - This Parameter is currently not supported. - - Returns - ------- - DataFrame - - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - objects. - - Notes - ----- - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better - solution is to append those rows to a list and then concatenate - the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([[1, 2], [3, 4]], columns=list('AB')) - >>> df - A B - 0 1 2 - 1 3 4 - >>> df2 = cudf.DataFrame([[5, 6], [7, 8]], columns=list('AB')) - >>> df2 - A B - 0 5 6 - 1 7 8 - >>> df.append(df2) - A B - 0 1 2 - 1 3 4 - 0 5 6 - 1 7 8 - - With `ignore_index` set to True: - - >>> df.append(df2, ignore_index=True) - A B - 0 1 2 - 1 3 4 - 2 5 6 - 3 7 8 - - The following, while not recommended methods for generating DataFrames, - show two ways to generate a DataFrame from multiple data sources. - Less efficient: - - >>> df = cudf.DataFrame(columns=['A']) - >>> for i in range(5): - ... df = df.append({'A': i}, ignore_index=True) - >>> df - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - - More efficient than above: - - >>> cudf.concat([cudf.DataFrame([i], columns=['A']) for i in range(5)], - ... ignore_index=True) - A - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - """ - if isinstance(other, dict): - if not ignore_index: - raise TypeError("Can only append a dict if ignore_index=True") - other = DataFrame(other) - elif isinstance(other, Series): - if other.name is None and not ignore_index: - raise TypeError( - "Can only append a Series if ignore_index=True " - "or if the Series has a name" - ) - - current_cols = self._data.to_pandas_index() - combined_columns = other.index.to_pandas() - if len(current_cols): - if cudf.utils.dtypes.is_mixed_with_object_dtype( - current_cols, combined_columns - ): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "the column index of dataframe and index of series " - "to same dtypes." 
- ) - - combined_columns = current_cols.union( - combined_columns, sort=False - ) - - if sort: - combined_columns = combined_columns.sort_values() - - other = other.reindex(combined_columns, copy=False).to_frame().T - if not current_cols.equals(combined_columns): - self = self.reindex(columns=combined_columns) - elif ( - isinstance(other, list) - and other - and not isinstance(other[0], DataFrame) - ): - other = DataFrame(other) - cols = self._data.to_pandas_index() - if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all(): - other = other.reindex(columns=cols) - - return super()._append(other, ignore_index, verify_integrity, sort) - @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 159cc318789..074bd554601 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3438,28 +3438,6 @@ def repeat(self, repeats, axis=None): self._index_names, ) - def _append( - self, other, ignore_index=False, verify_integrity=False, sort=None - ): - # Note: Do not remove this function until pandas does. This warning is - # to clean up cudf but to match a deprecation in pandas - warnings.warn( - "The append method is deprecated and will be removed in a future " - "version. Use cudf.concat instead.", - FutureWarning, - ) - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." - ) - - if is_list_like(other): - to_concat = [self, *other] - else: - to_concat = [self, other] - - return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) - def astype(self, dtype, copy=False, errors="raise", **kwargs): """Cast the object to the given dtype. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 041e5aa07b9..f12bd183676 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -841,82 +841,6 @@ def to_dict(self, into: type[dict] = dict) -> dict: """ return self.to_pandas().to_dict(into=into) - @_cudf_nvtx_annotate - def append(self, to_append, ignore_index=False, verify_integrity=False): - """Append values from another ``Series`` or array-like object. - If ``ignore_index=True``, the index is reset. - - Parameters - ---------- - to_append : Series or list/tuple of Series - Series to append with self. - ignore_index : boolean, default False. - If True, do not use the index. - verify_integrity : bool, default False - This Parameter is currently not supported. - - Returns - ------- - Series - A new concatenated series - - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - Series objects. - - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series([1, 2, 3]) - >>> s2 = cudf.Series([4, 5, 6]) - >>> s1 - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s2 - 0 4 - 1 5 - 2 6 - dtype: int64 - >>> s1.append(s2) - 0 1 - 1 2 - 2 3 - 0 4 - 1 5 - 2 6 - dtype: int64 - - >>> s3 = cudf.Series([4, 5, 6], index=[3, 4, 5]) - >>> s3 - 3 4 - 4 5 - 5 6 - dtype: int64 - >>> s1.append(s3) - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - - With `ignore_index` set to True: - - >>> s1.append(s2, ignore_index=True) - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - """ - return super()._append(to_append, ignore_index, verify_integrity) - @_cudf_nvtx_annotate def reindex(self, *args, **kwargs): """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index af95b95ed68..3828a1ac10c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -22,7 +22,12 @@ from packaging import version import cudf -from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, 
PANDAS_LT_140 +from cudf.core._compat import ( + PANDAS_GE_134, + PANDAS_GE_150, + PANDAS_GE_200, + PANDAS_LT_140, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.testing import _utils as utils @@ -239,7 +244,7 @@ def test_series_from_cupy_scalars(): @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) -def test_append_index(a, b): +def test_concat_index(a, b): df = pd.DataFrame() df["a"] = a @@ -249,19 +254,14 @@ def test_append_index(a, b): gdf["a"] = a gdf["b"] = b - # Check the default index after appending two columns(Series) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = df.a.append(df.b) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.a.append(gdf.b) + expected = pd.concat([df.a, df.b]) + actual = cudf.concat([gdf.a, gdf.b]) assert len(expected) == len(actual) assert_eq(expected.index, actual.index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = df.a.append(df.b, ignore_index=True) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.a.append(gdf.b, ignore_index=True) + expected = pd.concat([df.a, df.b], ignore_index=True) + actual = cudf.concat([gdf.a, gdf.b], ignore_index=True) assert len(expected) == len(actual) assert_eq(expected.index, actual.index) @@ -281,7 +281,8 @@ def test_append_index(a, b): pytest.param( {}, marks=pytest_xfail( - reason="https://github.com/rapidsai/cudf/issues/11080" + condition=not PANDAS_GE_150, + reason="https://github.com/rapidsai/cudf/issues/11080", ), ), pytest.param( @@ -1539,7 +1540,8 @@ def test_concat_different_column_dataframe(df1_d, df2_d): pdf1 = pd.DataFrame(df1_d) pdf2 = pd.DataFrame(df2_d) - # pandas warns when trying to concatenate any empty float columns (or float + # pandas(lower than pandas 2.0 only) warns when trying to + # concatenate 
any empty float columns (or float # columns with all None values) with any non-empty bool columns. def is_invalid_concat(left, right): return ( @@ -1548,7 +1550,7 @@ def is_invalid_concat(left, right): and right.count() == 0 ) - cond = any( + cond = (not PANDAS_GE_200) and any( is_invalid_concat(pdf1[colname], pdf2[colname]) or is_invalid_concat(pdf2[colname], pdf1[colname]) for colname in set(pdf1) & set(pdf2) @@ -7312,22 +7314,37 @@ def test_series_keys(ps): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_dataframe(df, other, sort, ignore_index): +def test_dataframe_concat_dataframe(df, other, sort, ignore_index): pdf = df other_pd = other gdf = cudf.from_pandas(df) other_gd = cudf.from_pandas(other) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) + expected = pd.concat([pdf, other_pd], sort=sort, ignore_index=ignore_index) + actual = cudf.concat([gdf, other_gd], sort=sort, ignore_index=ignore_index) + + # In empty dataframe cases, Pandas & cudf differ in columns + # creation, pandas creates RangeIndex(0, 0) + # whereas cudf creates an empty Index([], dtype="object"). 
+ check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) @pytest_unmark_spilling @@ -7372,20 +7389,18 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index): ], ) @pytest.mark.parametrize("sort", [False, True]) -def test_dataframe_append_series_dict(df, other, sort): +def test_dataframe_concat_series(df, other, sort): pdf = df - other_pd = other - gdf = cudf.from_pandas(df) - if isinstance(other, pd.Series): - other_gd = cudf.from_pandas(other) + + if isinstance(other, dict): + other_pd = pd.Series(other) else: - other_gd = other + other_pd = other + other_gd = cudf.from_pandas(other_pd) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, ignore_index=True, sort=sort) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, ignore_index=True, sort=sort) + expected = pd.concat([pdf, other_pd], ignore_index=True, sort=sort) + actual = cudf.concat([gdf, other_gd], ignore_index=True, sort=sort) if expected.shape != df.shape: # Ignore the column type comparison because pandas incorrectly @@ -7402,20 +7417,18 @@ def test_dataframe_append_series_dict(df, other, sort): assert_eq(expected, actual, check_index_type=not gdf.empty) -def test_dataframe_append_series_mixed_index(): +def test_dataframe_concat_series_mixed_index(): df = cudf.DataFrame({"first": [], "d": []}) + pdf = df.to_pandas() + sr = cudf.Series([1, 2, 3, 4]) + psr = sr.to_pandas() - with pytest.raises( - TypeError, - match=re.escape( - "cudf does not support 
mixed types, please type-cast " - "the column index of dataframe and index of series " - "to same dtypes." - ), - ): - with pytest.warns(FutureWarning, match="append method is deprecated"): - df.append(sr, ignore_index=True) + assert_eq( + cudf.concat([df, sr], ignore_index=True), + pd.concat([pdf, psr], ignore_index=True), + check_dtype=False, + ) @pytest_unmark_spilling @@ -7540,24 +7553,40 @@ def test_dataframe_append_series_mixed_index(): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_dataframe_lists(df, other, sort, ignore_index): +def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): pdf = df other_pd = other gdf = cudf.from_pandas(df) - other_gd = [ - cudf.from_pandas(o) if isinstance(o, pd.DataFrame) else o - for o in other - ] + other_gd = [cudf.from_pandas(o) for o in other] + + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) + + # In some cases, Pandas creates an empty Index([], dtype="object") for + # columns whereas cudf creates a RangeIndex(0, 0). 
+ check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) if expected.shape != df.shape: - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) @pytest.mark.parametrize( @@ -7633,20 +7662,19 @@ def test_dataframe_ffill(df): ) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_append_lists(df, other, sort, ignore_index): +def test_dataframe_concat_lists(df, other, sort, ignore_index): pdf = df - other_pd = other + other_pd = [pd.DataFrame(o) for o in other] gdf = cudf.from_pandas(df) - other_gd = [ - cudf.from_pandas(o) if isinstance(o, pd.DataFrame) else o - for o in other - ] + other_gd = [cudf.from_pandas(o) for o in other_pd] - with pytest.warns(FutureWarning, match="append method is deprecated"): - expected = pdf.append(other_pd, sort=sort, ignore_index=ignore_index) - with pytest.warns(FutureWarning, match="append method is deprecated"): - actual = gdf.append(other_gd, sort=sort, ignore_index=ignore_index) + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) if expected.shape != df.shape: assert_eq( @@ -7659,17 +7687,13 @@ def test_dataframe_append_lists(df, other, sort, ignore_index): assert_eq(expected, actual, check_index_type=not gdf.empty) 
-def test_dataframe_append_error(): +def test_dataframe_concat_series_without_name(): df = cudf.DataFrame({"a": [1, 2, 3]}) - ps = cudf.Series([1, 2, 3]) + pdf = df.to_pandas() + gs = cudf.Series([1, 2, 3]) + ps = gs.to_pandas() - with pytest.raises( - TypeError, - match="Can only append a Series if ignore_index=True " - "or if the Series has a name", - ): - with pytest.warns(FutureWarning, match="append method is deprecated"): - df.append(ps) + assert_eq(pd.concat([pdf, ps]), cudf.concat([df, gs])) def test_cudf_arrow_array_error(): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 682fccda8dc..719dee308b9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -95,17 +95,16 @@ def test_series_init_dict_lists(data): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_basic(data, others, ignore_index): +def test_series_concat_basic(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = pd.Series(others) other_gs = cudf.Series(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + assert_eq(expected, actual) @@ -142,17 +141,15 @@ def test_series_append_basic(data, others, ignore_index): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_basic_str(data, others, ignore_index): +def test_series_concat_basic_str(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = pd.Series(others) other_gs = cudf.Series(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, 
ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) assert_eq(expected, actual) @@ -195,21 +192,20 @@ def test_series_append_basic_str(data, others, ignore_index): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_series_with_index(data, others, ignore_index): +def test_series_concat_series_with_index(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = others other_gs = cudf.from_pandas(others) - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + assert_eq(expected, actual) -def test_series_append_error_mixed_types(): +def test_series_concat_error_mixed_types(): gsr = cudf.Series([1, 2, 3, 4]) other = cudf.Series(["a", "b", "c", "d"]) @@ -218,16 +214,14 @@ def test_series_append_error_mixed_types(): match="cudf does not support mixed types, please type-cast " "both series to same dtypes.", ): - with pytest.warns(FutureWarning): - gsr.append(other) + cudf.concat([gsr, other]) with pytest.raises( TypeError, match="cudf does not support mixed types, please type-cast " "both series to same dtypes.", ): - with pytest.warns(FutureWarning): - gsr.append([gsr, other, gsr, other]) + cudf.concat([gsr, gsr, other, gsr, other]) @pytest.mark.parametrize( @@ -278,35 +272,32 @@ def test_series_append_error_mixed_types(): ], ) @pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_append_list_series_with_index(data, others, ignore_index): +def test_series_concat_list_series_with_index(data, others, ignore_index): psr = pd.Series(data) gsr = cudf.Series(data) other_ps = others other_gs = [cudf.from_pandas(obj) for obj 
in others] - with pytest.warns(FutureWarning): - expected = psr.append(other_ps, ignore_index=ignore_index) - with pytest.warns(FutureWarning): - actual = gsr.append(other_gs, ignore_index=ignore_index) + expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) + actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) + assert_eq(expected, actual) -def test_series_append_existing_buffers(): +def test_series_concat_existing_buffers(): a1 = np.arange(10, dtype=np.float64) gs = cudf.Series(a1) # Add new buffer a2 = cudf.Series(np.arange(5)) - with pytest.warns(FutureWarning): - gs = gs.append(a2) + gs = cudf.concat([gs, a2]) assert len(gs) == 15 np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) # Ensure appending to previous buffer a3 = cudf.Series(np.arange(3)) - with pytest.warns(FutureWarning): - gs = gs.append(a3) + gs = cudf.concat([gs, a3]) assert len(gs) == 18 a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) np.testing.assert_equal(gs.to_numpy(), a4) @@ -314,13 +305,11 @@ def test_series_append_existing_buffers(): # Appending different dtype a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) - with pytest.warns(FutureWarning): - gs = a5.append(a6) + gs = cudf.concat([a5, a6]) np.testing.assert_equal( gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) ) - with pytest.warns(FutureWarning): - gs = cudf.Series(a6).append(a5) + gs = cudf.concat([cudf.Series(a6), a5]) np.testing.assert_equal( gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) ) From 4a87cbde1df22eb1029269491757ce173648cf74 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Mar 2023 14:58:10 -0500 Subject: [PATCH 010/162] Drop `na_sentinel` from `factorize` (#12924) This PR drops support for `na_sentinel` in factorize APIs, to match with pandas-2.0 --- python/cudf/cudf/core/algorithms.py | 52 ++------------------ python/cudf/cudf/core/multiindex.py | 8 +-- 
python/cudf/cudf/core/single_column_frame.py | 11 +---- python/cudf/cudf/tests/test_series.py | 23 --------- 4 files changed, 7 insertions(+), 87 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 7012496434a..50ec4b774ee 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -11,9 +11,7 @@ from cudf.core.series import Series -def factorize( - values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None -): +def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): """Encode the input values as integer labels Parameters @@ -22,14 +20,6 @@ def factorize( The data to be factorized. sort : bool, default True Sort uniques and shuffle codes to maintain the relationship. - na_sentinel : number, default -1 - Value to indicate missing category. - - .. deprecated:: 23.04 - - The na_sentinel argument is deprecated and will be removed in - a future version of cudf. Specify use_na_sentinel as - either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NA values. If False, NA values will be encoded as non-negative @@ -83,51 +73,19 @@ def factorize( >>> uniques Float64Index([, 1.0, 2.0], dtype='float64') """ - # TODO: Drop `na_sentinel` in the next release immediately after - # pandas 2.0 upgrade. - if na_sentinel is not None and use_na_sentinel is not None: - raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentile`; " - f"got `na_sentinel={na_sentinel}` and " - f"`use_na_sentinel={use_na_sentinel}`" - ) return_cupy_array = isinstance(values, cp.ndarray) values = Series(values) - if na_sentinel is None: - na_sentinel = ( - -1 - if use_na_sentinel is None or use_na_sentinel - else Scalar(None, dtype=values.dtype) - ) - else: - if na_sentinel is None: - msg = ( - "Specifying `na_sentinel=None` is deprecated, specify " - "`use_na_sentinel=False` instead." 
- ) - elif na_sentinel == -1: - msg = ( - "Specifying `na_sentinel=-1` is deprecated, specify " - "`use_na_sentinel=True` instead." - ) - else: - msg = ( - "Specifying the specific value to use for `na_sentinel` is " - "deprecated and will be removed in a future version of cudf. " - "Specify `use_na_sentinel=True` to use the sentinel value -1, " - "and `use_na_sentinel=False` to encode NA values.", - ) - warnings.warn(msg, FutureWarning) - if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") - if use_na_sentinel is None or use_na_sentinel: + if use_na_sentinel: + na_sentinel = Scalar(-1) cats = values._column.dropna() else: + na_sentinel = Scalar(None, dtype=values.dtype) cats = values._column cats = cats.unique().astype(values.dtype) @@ -136,7 +94,7 @@ def factorize( cats, _ = cats.sort_by_values() labels = values._column._label_encoding( - cats=cats, na_sentinel=Scalar(na_sentinel) + cats=cats, na_sentinel=na_sentinel ).values return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d9d1aecb9d6..4cd3f0b3837 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -671,13 +671,7 @@ def _compute_levels_and_codes(self): codes = {} for name, col in self._data.items(): - with warnings.catch_warnings(): - # TODO: Remove this filter when - # `na_sentinel` is removed from `factorize`. - # This is a filter to not let the warnings from - # `factorize` show up in other parts of public APIs. 
- warnings.simplefilter("ignore") - code, cats = cudf.Series._from_data({None: col}).factorize() + code, cats = cudf.Series._from_data({None: col}).factorize() codes[name] = code.astype(np.int64) levels.append(cudf.Series(cats, name=None)) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 9e380e63ae0..1ffb48fe19e 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -249,21 +249,13 @@ def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ @_cudf_nvtx_annotate - def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + def factorize(self, sort=False, use_na_sentinel=True): """Encode the input values as integer labels. Parameters ---------- sort : bool, default True Sort uniques and shuffle codes to maintain the relationship. - na_sentinel : number, default -1 - Value to indicate missing category. - - .. deprecated:: 23.04 - - The na_sentinel argument is deprecated and will be removed in - a future version of cudf. Specify use_na_sentinel as - either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NA values. 
If False, NA values will be encoded as non-negative @@ -290,7 +282,6 @@ def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): return cudf.core.algorithms.factorize( self, sort=sort, - na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel, ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 719dee308b9..cb50e21094a 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -459,29 +459,6 @@ def test_series_describe_other_types(ps): assert_eq(expected.astype("str"), actual) -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) -def test_series_factorize(data, na_sentinel): - gsr = cudf.Series(data) - psr = gsr.to_pandas() - - with pytest.warns(FutureWarning): - expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) - with pytest.warns(FutureWarning): - actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) - - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats.values, actual_cats.to_pandas().values) - - @pytest.mark.parametrize( "data", [ From d1377a580656526119a33cabb898becd15cf152a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Mar 2023 16:06:56 -0500 Subject: [PATCH 011/162] Add information about `Index.is_*` method deprecation (#12909) This PR adds additional information for the following Index APIs to match with pandas 2.0: is_numeric is_boolean is_integer is_floating is_object is_categorical is_interval --- python/cudf/cudf/core/_base_index.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 88763b8a011..1d0a30b556d 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -9,6 +9,7 @@ import pandas as pd import cudf +import 
warnings from cudf._lib.copying import _gather_map_is_valid, gather from cudf._lib.stream_compaction import ( apply_boolean_mask, @@ -858,6 +859,7 @@ def is_numeric(self): >>> idx.is_numeric() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_numeric is deprecated. " "Use cudf.api.types.is_any_real_numeric_dtype instead", @@ -902,6 +904,7 @@ def is_boolean(self): >>> idx.is_boolean() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_boolean is deprecated. " "Use cudf.api.types.is_bool_dtype instead", @@ -946,6 +949,7 @@ def is_integer(self): >>> idx.is_integer() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_integer is deprecated. " "Use cudf.api.types.is_integer_dtype instead", @@ -997,6 +1001,7 @@ def is_floating(self): >>> idx.is_floating() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_floating is deprecated. " "Use cudf.api.types.is_float_dtype instead", @@ -1042,6 +1047,7 @@ def is_object(self): >>> idx.is_object() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_object is deprecated. " "Use cudf.api.types.is_object_dtype instead", @@ -1094,6 +1100,7 @@ def is_categorical(self): >>> s.index.is_categorical() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_categorical is deprecated. " "Use cudf.api.types.is_categorical_dtype instead", @@ -1140,6 +1147,7 @@ def is_interval(self): >>> idx.is_interval() False """ + # TODO: Only remove this deprecation after pandas removes this API. warnings.warn( f"{type(self).__name__}.is_interval is deprecated. 
" "Use cudf.api.types.is_interval_dtype instead", From 48c1016b6882c79c2fdece65e55ff82a76e67e63 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 31 Mar 2023 13:53:15 -0500 Subject: [PATCH 012/162] [REVIEW] Miscellaneous pytest fixes for pandas-2.0 (#12962) This PR contains miscellaneous fixes in pytests. The changes in pytests are due to subtle changes in behaviors from the pandas-2.0 side. --- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 4 +++- python/cudf/cudf/tests/test_stats.py | 17 +++++------------ python/cudf/cudf/tests/test_string.py | 12 ++++++++++-- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index c24ff080033..aad163736c2 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2454,7 +2454,7 @@ def test_parquet_writer_decimal(decimal_type, data): buff = BytesIO() gdf.to_parquet(buff) - got = pd.read_parquet(buff, use_nullable_dtypes=True) + got = pd.read_parquet(buff, dtype_backend="numpy_nullable") assert_eq(gdf.to_pandas(nullable=True), got) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 78e95fdbd81..bf2c1a32b64 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -76,7 +76,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) # pandas' melt makes the 'variable' column of 'object' type (string) # cuDF's melt makes it Categorical because it doesn't support strings - expect["variable"] = expect["variable"].astype("category") + expect["variable"] = expect["variable"].astype( + got["variable"].dtype.to_pandas() + ) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6ca64fdcfa3..126a90e580c 100644 ---
a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -290,25 +290,18 @@ def test_kurtosis_series(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_skew_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.skew() - expected = pdata.skew() - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - got = data.skew(numeric_only=False) - expected = pdata.skew(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) - with pytest.raises(NotImplementedError): - data.skew(numeric_only=True) + assert_eq(got, expected) @pytest.mark.parametrize("dtype", params_dtypes) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 10208611f13..693c9ef8044 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1848,7 +1848,11 @@ def test_string_count(data, pat, flags): ps.str.count(pat=pat, flags=flags), check_dtype=False, ) - assert_eq(as_index(gs).str.count(pat=pat), pd.Index(ps).str.count(pat=pat)) + assert_eq( + cudf.Index(gs).str.count(pat=pat), + pd.Index(ps).str.count(pat=pat), + exact=False, + ) @pytest.mark.parametrize( @@ -2214,7 +2218,11 @@ def test_string_str_rindex(data, sub, er): if er is None: assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) - assert_eq(pd.Index(ps).str.rindex(sub), as_index(gs).str.rindex(sub)) + assert_eq( + pd.Index(ps).str.rindex(sub), + as_index(gs).str.rindex(sub), + exact=False, + ) try: ps.str.rindex(sub) From dd15a19516df71040a227d581505658626e2e308 Mon Sep 17 
00:00:00 2001 From: galipremsagar Date: Fri, 7 Apr 2023 09:48:16 -0700 Subject: [PATCH 013/162] Add get_indexer --- python/cudf/cudf/core/_base_index.py | 5 +- python/cudf/cudf/core/index.py | 98 +++++--- python/cudf/cudf/core/multiindex.py | 81 ++++++- python/cudf/cudf/tests/test_index.py | 335 +++++++++++++++++++++++---- 4 files changed, 427 insertions(+), 92 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1d0a30b556d..8d448b99ac6 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -91,7 +91,10 @@ def size(self): def values(self): raise NotImplementedError - def get_loc(self, key, method=None, tolerance=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): + raise NotImplementedError + + def get_loc(self, key): raise NotImplementedError def __getitem__(self, key): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 209276215c8..bf57a8f115f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -575,19 +575,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ) @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): - # We should not actually remove this code until we have implemented the - # get_indexers method as an alternative, see - # https://github.com/rapidsai/cudf/issues/12312 - if method is not None: - warnings.warn( - f"Passing method to {self.__class__.__name__}.get_loc is " - "deprecated and will raise in a future version.", - FutureWarning, - ) - + def get_indexer(self, target, method=None, limit=None, tolerance=None): # Given an actual integer, - idx = (key - self._start) / self._step + idx = (target - self._start) / self._step idx_int_upper_bound = (self._stop - self._start) // self._step if method is None: if tolerance is not None: @@ -597,17 +587,17 @@ def get_loc(self, key, method=None, tolerance=None): ) if idx > 
idx_int_upper_bound or idx < 0: - raise KeyError(key) + raise KeyError(target) - idx_int = (key - self._start) // self._step + idx_int = (target - self._start) // self._step if idx_int != idx: - raise KeyError(key) + raise KeyError(target) return idx_int if (method == "ffill" and idx < 0) or ( method == "bfill" and idx > idx_int_upper_bound ): - raise KeyError(key) + raise KeyError(target) round_method = { "ffill": math.floor, @@ -615,9 +605,16 @@ def get_loc(self, key, method=None, tolerance=None): "nearest": round, }[method] if tolerance is not None and (abs(idx) * self._step > tolerance): - raise KeyError(key) + raise KeyError(target) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + @_cudf_nvtx_annotate + def get_loc(self, key): + # Given an actual integer, + if is_scalar(key): + key = [key] + return self.get_indexer(key) + @_cudf_nvtx_annotate def _union(self, other, sort=None): if isinstance(other, RangeIndex): @@ -1128,12 +1125,12 @@ def astype(self, dtype, copy: bool = True): return _index_from_data(super().astype({self.name: dtype}, copy)) @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. Parameters ---------- - key : label + target : label method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - default: exact matches only. - pad / ffill: find the PREVIOUS index value if no exact match. @@ -1144,7 +1141,7 @@ def get_loc(self, key, method=None, tolerance=None): tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of the index at the matching location must satisfy the equation - ``abs(index[loc] - key) <= tolerance``. + ``abs(index[loc] - target) <= tolerance``. 
Returns ------- @@ -1168,15 +1165,8 @@ def get_loc(self, key, method=None, tolerance=None): >>> numeric_unique_index.get_loc(3) 2 """ - # We should not actually remove this code until we have implemented the - # get_indexers method as an alternative, see - # https://github.com/rapidsai/cudf/issues/12312 - if method is not None: - warnings.warn( - f"Passing method to {self.__class__.__name__}.get_loc is " - "deprecated and will raise in a future version.", - FutureWarning, - ) + if is_scalar(target): + raise TypeError("Should be a sequence") if tolerance is not None: raise NotImplementedError( "Parameter tolerance is not supported yet." @@ -1204,22 +1194,20 @@ def get_loc(self, key, method=None, tolerance=None): "is specified." ) - key_as_table = cudf.core.frame.Frame( - {"None": as_column(key, length=1)} - ) + target_as_table = cudf.core.frame.Frame({"None": as_column(target)}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, key_as_table, is_sorted + self, target_as_table, is_sorted ) if lower_bound == upper_bound: - # Key not found, apply method + # target not found, apply method if method in ("pad", "ffill"): if lower_bound == 0: - raise KeyError(key) + raise KeyError(target) return lower_bound - 1 elif method in ("backfill", "bfill"): if lower_bound == self._data.nrows: - raise KeyError(key) + raise KeyError(target) return lower_bound elif method == "nearest": if lower_bound == self._data.nrows: @@ -1230,11 +1218,11 @@ def get_loc(self, key, method=None, tolerance=None): upper_val = self._column.element_indexing(lower_bound) return ( lower_bound - 1 - if abs(lower_val - key) < abs(upper_val - key) + if abs(lower_val - target) < abs(upper_val - target) else lower_bound ) else: - raise KeyError(key) + raise KeyError(target) if lower_bound + 1 == upper_bound: # Search result is unique, return int. 
@@ -1255,6 +1243,40 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask + @_cudf_nvtx_annotate + def get_loc(self, key): + """Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + + Returns + ------- + int or slice or boolean mask + - If result is unique, return integer index + - If index is monotonic, loc is returned as a slice object + - Otherwise, a boolean mask is returned + + Examples + -------- + >>> unique_index = cudf.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + >>> monotonic_index = cudf.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + >>> non_monotonic_index = cudf.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + >>> numeric_unique_index = cudf.Index([1, 2, 3]) + >>> numeric_unique_index.get_loc(3) + 2 + """ + if is_scalar(key): + key = [key] + return self.get_indexer(target=key) + @_cudf_nvtx_annotate def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4cd3f0b3837..f533cff7c12 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1642,7 +1642,7 @@ def _level_index_from_level(self, level): return level @_cudf_nvtx_annotate - def get_loc(self, key, method=None, tolerance=None): + def get_indexer(self, target, method=None, tolerance=None): """ Get location for a label or a tuple of labels. 
@@ -1650,7 +1650,7 @@ def get_loc(self, key, method=None, tolerance=None): Parameters ---------- - key : label or tuple of labels (one for each level) + target : label or tuple of labels (one for each level) method : None Returns @@ -1712,24 +1712,26 @@ def get_loc(self, key, method=None, tolerance=None): self.is_monotonic_increasing or self.is_monotonic_decreasing ) is_unique = self.is_unique - key = (key,) if not isinstance(key, tuple) else key + target = (target,) if not isinstance(target, tuple) else target - # Handle partial key search. If length of `key` is less than `nlevels`, - # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} + # Handle partial target search. If length of `target` is less than `nlevels`, + # Only search levels up to `len(target)` level. + target_as_table = cudf.core.frame.Frame( + {i: column.as_column(k, length=1) for i, k in enumerate(target)} ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index( + slice(target_as_table._num_columns) + ) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range(partial_index, target_as_table, is_sorted) if lower_bound == upper_bound: - raise KeyError(key) + raise KeyError(target) if is_unique and lower_bound + 1 == upper_bound: # Indices are unique (Pandas constraint), search result is unique, @@ -1755,6 +1757,65 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask + @_cudf_nvtx_annotate + def get_loc(self, key): + """ + Get location for a label or a tuple of labels. + + The location is returned as an integer/slice or boolean mask. 
+ + Parameters + ---------- + key : label or tuple of labels (one for each level) + method : None + + Returns + ------- + loc : int, slice object or boolean mask + - If index is unique, search result is unique, return a single int. + - If index is monotonic, index is returned as a slice object. + - Otherwise, cudf attempts a best effort to convert the search + result into a slice object, and will return a boolean mask if + it fails to do so. Notice this can deviate from Pandas behavior + in some situations. + + Examples + -------- + >>> import cudf + >>> mi = cudf.MultiIndex.from_tuples( + ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) + >>> mi.get_loc('b') + slice(1, 3, None) + >>> mi.get_loc(('b', 'e')) + 1 + >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( + ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) + >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas + slice(1, 4, 2) + + .. pandas-compat:: + **MultiIndex.get_loc** + + The return types of this function may deviate from the + method provided by Pandas. If the index is neither + lexicographically sorted nor unique, a best effort attempt is made + to coerce the found indices into a slice. For example: + + .. code-block:: + + >>> import pandas as pd + >>> import cudf + >>> x = pd.MultiIndex.from_tuples([ + ... (2, 1, 1), (1, 2, 3), (1, 2, 1), + ... (1, 1, 1), (1, 1, 1), (2, 2, 1), + ... 
]) + >>> x.get_loc(1) + array([False, True, True, True, True, False]) + >>> cudf.from_pandas(x).get_loc(1) + slice(1, 5, 1) + """ + return self.get_indexer(target=key) + def _get_reconciled_name_object(self, other) -> MultiIndex: """ If the result of a set operation will be self, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f9ad48c48af..312baf2d7c6 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1967,30 +1967,66 @@ def test_get_loc_single_unique_numeric(idx, key, method): "idx", [pd.RangeIndex(3, 100, 4)], ) -@pytest.mark.parametrize("key", list(range(1, 110, 3))) +@pytest.mark.parametrize( + "key", + [ + list(range(1, 20, 3)), + list(range(20, 35, 3)), + list(range(35, 77, 3)), + list(range(77, 110, 3)), + ], +) @pytest.mark.parametrize("method", [None, "ffill"]) -def test_get_loc_rangeindex(idx, key, method): +def test_get_indexer_rangeindex(idx, key, method): pi = idx gi = cudf.from_pandas(pi) + # if ( + # (any(k not in pi for k in key) and method is None) + # # Get key before the first element is KeyError + # or (key < pi.start and method in "ffill") + # # Get key after the last element is KeyError + # or (key >= pi.stop and method in "bfill") + # ): + # assert_exceptions_equal( + # lfunc=pi.get_indexer, + # rfunc=gi.get_indexer, + # lfunc_args_and_kwargs=([], {"key": key, "method": method}), + # rfunc_args_and_kwargs=([], {"key": key, "method": method}), + # ) + # else: + # with expect_warning_if(method is not None): + expected = pi.get_indexer(key, method=method) + # with expect_warning_if(method is not None): + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [pd.RangeIndex(3, 100, 4)], +) +@pytest.mark.parametrize("key", list(range(1, 110, 3))) +def test_get_loc_rangeindex(idx, key): + pi = idx + gi = cudf.from_pandas(pi) if ( - (key not in pi and method is None) + (key not in pi) # Get key before 
the first element is KeyError - or (key < pi.start and method in "ffill") + or (key < pi.start) # Get key after the last element is KeyError - or (key >= pi.stop and method in "bfill") + or (key >= pi.stop) ): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_loc(key) + got = gi.get_loc(key) assert_eq(expected, got) @@ -2003,8 +2039,7 @@ def test_get_loc_rangeindex(idx, key, method): ], ) @pytest.mark.parametrize("key", [0, 3, 6, 7]) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_single_duplicate_numeric(idx, key, method): +def test_get_loc_single_duplicate_numeric(idx, key): pi = idx gi = cudf.from_pandas(pi) @@ -2012,14 +2047,61 @@ def test_get_loc_single_duplicate_numeric(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index([1, 3, 3, 6]), # monotonic + pd.Index([6, 1, 3, 3]), # non-monotonic + ], +) +@pytest.mark.parametrize("key", [0, 3, 6, 7]) +@pytest.mark.parametrize("method", [None]) +def test_get_indexer_single_duplicate_numeric(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = 
pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] +) +@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) +def test_get_loc_single_unique_string(idx, key): + pi = idx + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) assert_eq(expected, got) @@ -2029,7 +2111,7 @@ def test_get_loc_single_duplicate_numeric(idx, key, method): ) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_loc_single_unique_string(idx, key, method): +def test_get_indexer_single_unique_string(idx, key, method): pi = idx gi = cudf.from_pandas(pi) @@ -2043,16 +2125,14 @@ def test_get_loc_single_unique_string(idx, key, method): or (key == "z" and method == "bfill") ): assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -2061,8 +2141,7 @@ def test_get_loc_single_unique_string(idx, key, method): "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] ) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) 
-@pytest.mark.parametrize("method", [None]) -def test_get_loc_single_duplicate_string(idx, key, method): +def test_get_loc_single_duplicate_string(idx, key): pi = idx gi = cudf.from_pandas(pi) @@ -2070,14 +2149,35 @@ def test_get_loc_single_duplicate_string(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] +) +@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) +@pytest.mark.parametrize("method", [None]) +def test_get_indexer_single_duplicate_string(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -2097,8 +2197,7 @@ def test_get_loc_single_duplicate_string(idx, key, method): ], ) @pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_numeric(idx, key, method): +def test_get_loc_multi_numeric(idx, key): pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2106,14 +2205,90 @@ def test_get_loc_multi_numeric(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + 
assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] + ), + pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] + ), + pd.MultiIndex.from_tuples( + [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)] + ), + ], +) +@pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) +@pytest.mark.parametrize("method", [None]) +def test_get_indexer_multi_numeric(idx, key, method): + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] + ) + ], +) +@pytest.mark.parametrize( + "key, result", + [ + (1, slice(1, 5, 1)), # deviates + ((1, 2), slice(1, 3, 1)), + ((1, 2, 3), slice(1, 2, None)), + ((2, 1, 1), slice(0, 1, None)), + ((9, 9, 9), None), + ], +) +def test_get_loc_multi_numeric_deviate(idx, key, result): + pi = idx + gi = cudf.from_pandas(pi) + + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): + key_flag = key not in pi + + if key_flag: + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = 
result + got = gi.get_loc(key) assert_eq(expected, got) @@ -2137,7 +2312,7 @@ def test_get_loc_multi_numeric(idx, key, method): ], ) @pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_numeric_deviate(idx, key, result, method): +def test_get_indexer_multi_numeric_deviate(idx, key, result, method): pi = idx gi = cudf.from_pandas(pi) @@ -2222,8 +2397,7 @@ def test_get_loc_multi_numeric_deviate(idx, key, result, method): @pytest.mark.parametrize( "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] ) -@pytest.mark.parametrize("method", [None]) -def test_get_loc_multi_string(idx, key, method): +def test_get_loc_multi_string(idx, key): pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2231,14 +2405,89 @@ def test_get_loc_multi_string(idx, key, method): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "idx", + [ + pd.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ] + ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ] + ), + pd.MultiIndex.from_tuples( + [ + ("a", "a", "b"), + ("b", "a", "a"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + ], +) +@pytest.mark.parametrize( + "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] +) 
+@pytest.mark.parametrize("method", [None]) +def test_get_indexer_multi_string(idx, key, method): + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_indexer, + rfunc=gi.get_indexer, lfunc_args_and_kwargs=([], {"key": key, "method": method}), rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) From 6dce4ef93c0e4e40c764922a0370417cb78150b6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 7 Apr 2023 12:05:38 -0500 Subject: [PATCH 014/162] Fix ufunc tests (#13083) Pandas 2.0 introduced support for ufuncs when the two columns are indexed. This PR updates the pytests accordingly. --- python/cudf/cudf/core/indexed_frame.py | 8 -------- python/cudf/cudf/tests/test_array_ufunc.py | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 074bd554601..43085b297b0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3330,14 +3330,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): fname = ufunc.__name__ if ret is not None: - # pandas bitwise operations return bools if indexes are misaligned. - if "bitwise" in fname: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - if isinstance(other, self.__class__) and not self.index.equals( - other.index - ): - ret = ret.astype(bool) return ret # Attempt to dispatch all other functions to cupy. 
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index ac77c6b89f3..8f4ae7c23a0 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 from cudf.testing._utils import assert_eq, set_random_null_mask_inplace _UFUNCS = [ @@ -165,6 +165,16 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): ) ) + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 + and fname.startswith("bitwise") + and indexed + and has_nulls, + reason="https://github.com/pandas-dev/pandas/issues/52500", + ) + ) + N = 100 # Avoid zeros in either array to skip division by 0 errors. Also limit the # scale to avoid issues with overflow, etc. We use ints because some @@ -342,8 +352,8 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): request.applymarker( pytest.mark.xfail( condition=( - indexed - and fname + not PANDAS_GE_200 + and indexed in { "add", "arctan2", @@ -379,7 +389,7 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): } ), reason=( - "pandas does not currently support misaligned " + "pandas<2.0 does not currently support misaligned " "indexes in DataFrames" ), ) From 192e2045ffff4366b501e6a1b3d781cc30a4cdf6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 7 Apr 2023 12:06:19 -0500 Subject: [PATCH 015/162] [REVIEW] datetime and timedelta improvements (#12934) This PR fixes 1046 pytest failures that are related to `datetime64` & `timedelta64` types. 
This PR(`time_2.0`): ```bash = 990 failed, 86109 passed, 2034 skipped, 995 xfailed, 165 xpassed in 546.70s (0:09:06) = ``` on `pandas_2.0_feature_branch`: ```bash == 2036 failed, 85423 passed, 2034 skipped, 860 xfailed in 720.53s (0:12:00) === ``` --- python/cudf/cudf/core/column/datetime.py | 63 ++++-- python/cudf/cudf/core/column/timedelta.py | 191 ++++++++++-------- python/cudf/cudf/core/index.py | 22 +- python/cudf/cudf/core/tools/datetimes.py | 12 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 19 +- python/cudf/cudf/tests/test_datetime.py | 42 ++-- python/cudf/cudf/tests/test_parquet.py | 23 ++- python/cudf/cudf/tests/test_timedelta.py | 34 +++- python/cudf/cudf/tests/test_udf_masked_ops.py | 13 +- python/cudf/cudf/utils/dtypes.py | 10 +- 11 files changed, 283 insertions(+), 148 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 14aa7bdd84b..107ebfbbcc3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -9,9 +9,9 @@ from typing import Any, Mapping, Sequence, cast import numpy as np -import pandas as pd import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._typing import ( ColumnBinaryOperand, @@ -21,6 +21,7 @@ ScalarLike, ) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.core._compat import PANDAS_GE_200 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion @@ -200,9 +201,16 @@ def to_pandas( # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 - # Pandas supports only `datetime64[ns]`, hence the cast. + if PANDAS_GE_200: + host_values = self.fillna("NaT").values_host + else: + # Pandas<2.0 supports only `datetime64[ns]`, hence the cast. 
+ host_values = ( + self.astype("datetime64[ns]").fillna("NaT").values_host + ) + return pd.Series( - self.astype("datetime64[ns]").fillna("NaT").values_host, + host_values, copy=False, index=index, ) @@ -243,19 +251,30 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, np.datetime64): if np.isnat(other): - return cudf.Scalar(None, dtype=self.dtype) + other_time_unit = cudf.utils.dtypes.get_time_unit(other) + if other_time_unit not in {"s", "ms", "ns", "us"}: + other_time_unit = "ns" + + return cudf.Scalar( + None, dtype=f"datetime64[{other_time_unit}]" + ) other = other.astype(self.dtype) return cudf.Scalar(other) elif isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) + if np.isnat(other): + return cudf.Scalar( + None, + dtype="timedelta64[ns]" + if other_time_unit not in {"s", "ms", "ns", "us"} + else other.dtype, + ) + if other_time_unit not in {"s", "ms", "ns", "us"}: other = other.astype("timedelta64[s]") - if np.isnat(other): - return cudf.Scalar(None, dtype=other.dtype) - return cudf.Scalar(other) elif isinstance(other, str): try: @@ -352,7 +371,7 @@ def mean( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def std( self, @@ -366,12 +385,12 @@ def std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], - ) + ).as_unit(self.time_unit) def median(self, skipna: bool = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical.median(skipna=skipna), unit=self.time_unit - ) + ).as_unit(self.time_unit) def quantile( self, @@ -387,7 +406,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timestamp(result, unit=self.time_unit) + return pd.Timestamp(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @@ -396,7 +417,9 @@ def 
_binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if other is NotImplemented: return NotImplemented if isinstance(other, cudf.DateOffset): - return other._datetime_binop(self, op, reflect=reflect) + return other._datetime_binop(self, op, reflect=reflect).astype( + self.dtype + ) # We check this on `other` before reflection since we already know the # dtype of `self`. @@ -441,7 +464,11 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if out_dtype is None: return NotImplemented - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + if out_dtype != cudf.dtype(np.bool_) and op == "__add__": + return result_col # .astype(lhs.dtype) + else: + return result_col def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None @@ -525,7 +552,15 @@ def infer_format(element: str, **kwargs) -> str: fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: - return fmt + if ".%f" in fmt: + # For context read: + # https://github.com/pandas-dev/pandas/issues/52418 + # We cannot rely on format containing only %f + # c++/libcudf expects .%3f, .%6f, .%9f + # Logic below handles those cases well. 
+ pass + else: + return fmt element_parts = element.split(".") if len(element_parts) != 2: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e7979fa4d27..29fe448db75 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -17,6 +17,7 @@ from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _fillna_natwise +from cudf.core._compat import PANDAS_GE_200 _dtype_to_format_conversion = { "timedelta64[ns]": "%D days %H:%M:%S", @@ -149,9 +150,16 @@ def to_pandas( # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 - # Pandas supports only `timedelta64[ns]`, hence the cast. + if PANDAS_GE_200: + host_values = self.fillna("NaT").values_host + else: + # Pandas<2.0 supports only `timedelta64[ns]`, hence the cast. + host_values = ( + self.astype("timedelta64[ns]").fillna("NaT").values_host + ) + pd_series = pd.Series( - self.astype("timedelta64[ns]").fillna("NaT").values_host, + host_values, copy=False, ) @@ -213,16 +221,21 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, (ColumnBase, cudf.Scalar)): return other - if isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - elif isinstance(other, pd.Timestamp): + if isinstance(other, pd.Timestamp): other = other.to_datetime64() elif isinstance(other, pd.Timedelta): other = other.to_timedelta64() + elif isinstance(other, datetime.timedelta): + other = np.timedelta64(other) if isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) if np.isnat(other): - return cudf.Scalar(None, dtype=self.dtype) + return cudf.Scalar( + None, + dtype="timedelta64[ns]" + if other_time_unit not in {"s", "ms", "ns", "us"} + else self.dtype, + ) if other_time_unit not in 
{"s", "ms", "ns", "us"}: common_dtype = "timedelta64[s]" @@ -259,9 +272,8 @@ def fillna( col: ColumnBase = self if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): - dtype = determine_out_dtype(self.dtype, fill_value.dtype) + dtype = self.dtype fill_value = fill_value.astype(dtype) - col = col.astype(dtype) if not isinstance(fill_value, cudf.Scalar): fill_value = cudf.Scalar(fill_value, dtype=dtype) else: @@ -311,12 +323,12 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def median(self, skipna: bool = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit - ) + ).as_unit(self.time_unit) def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) @@ -335,7 +347,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timedelta(result, unit=self.time_unit) + return pd.Timedelta(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def sum( @@ -352,7 +366,7 @@ def sum( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def std( self, @@ -366,7 +380,7 @@ def std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, - ) + ).as_unit(self.time_unit) def components(self, index=None) -> "cudf.DataFrame": """ @@ -397,79 +411,72 @@ def components(self, index=None) -> "cudf.DataFrame": 4 37 13 12 14 234 0 0 """ # noqa: E501 - return cudf.DataFrame( - data={ - "days": self - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns") - ), - "minutes": ( - 
self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["m"], "ns") - ), - "seconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ) - ) - ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ), - "milliseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ) - ) + date_meta = { + "seconds": ["m", "s"], + "milliseconds": ["s", "ms"], + "microseconds": ["ms", "us"], + "nanoseconds": ["us", "ns"], + } + data = { + "days": self + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) + ), + "hours": ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ms"], "ns") - ), - "microseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["ms"], "ns" - ) - ) + ) + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) + ), + "minutes": ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ), - "nanoseconds": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["us"], "ns" - ) - ) + ) + // cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion["m"], "ns" + ).astype(self.dtype) + ), + } + keys_list = iter(date_meta.keys()) + for name in keys_list: + value = date_meta[name] + data[name] = ( + self + % cudf.Scalar( + np.timedelta64( + _unit_to_nanoseconds_conversion[value[0]], "ns" + ).astype(self.dtype) ) - // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ), - }, + ) // cudf.Scalar( + 
np.timedelta64( + _unit_to_nanoseconds_conversion[value[1]], "ns" + ).astype(self.dtype) + ) + if self._time_unit == value[1]: + break + + for name in keys_list: + res_col = cudf.core.column.full(len(self), 0, dtype="int64") + if self.nullable: + res_col = res_col.set_mask(self.mask) + data[name] = res_col + + return cudf.DataFrame( + data=data, index=index, ) @@ -483,7 +490,9 @@ def days(self) -> "cudf.core.column.NumericalColumn": NumericalColumn """ return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") + np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( + self.dtype + ) ) @property @@ -503,7 +512,9 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": return ( self % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns") + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) ) // cudf.Scalar( np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") @@ -524,7 +535,10 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of microseconds. return ( - self % np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") + self + % np.timedelta64( + _unit_to_nanoseconds_conversion["s"], "ns" + ).astype(self.dtype) ) // cudf.Scalar( np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") ) @@ -544,6 +558,11 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": # performing division operation to extract the number # of nanoseconds. 
+ if self._time_unit != "ns": + res_col = cudf.core.column.full(len(self), 0, dtype="int64") + if self.nullable: + res_col = res_col.set_mask(self.mask) + return cast("cudf.core.column.NumericalColumn", res_col) return ( self % cudf.Scalar( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 209276215c8..0ce4ccfa00e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -64,6 +64,7 @@ numeric_normalize_types, ) from cudf.utils.utils import _cudf_nvtx_annotate, search_range +from cudf.core._compat import PANDAS_GE_200 T = TypeVar("T", bound="Frame") @@ -2289,7 +2290,10 @@ def isocalendar(self): @_cudf_nvtx_annotate def to_pandas(self, nullable=False): - nanos = self._values.astype("datetime64[ns]") + if PANDAS_GE_200: + nanos = self._values + else: + nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) @_cudf_nvtx_annotate @@ -2507,7 +2511,9 @@ def days(self): """ Number of days for each element. """ - return as_index(arbitrary=self._values.days, name=self.name) + return as_index( + arbitrary=self._values.days, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2515,7 +2521,9 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index(arbitrary=self._values.seconds, name=self.name) + return as_index( + arbitrary=self._values.seconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2523,7 +2531,9 @@ def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return as_index(arbitrary=self._values.microseconds, name=self.name) + return as_index( + arbitrary=self._values.microseconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2532,7 +2542,9 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
""" - return as_index(arbitrary=self._values.nanoseconds, name=self.name) + return as_index( + arbitrary=self._values.nanoseconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 92ef49e92d9..e5a03d76721 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import math import re @@ -55,7 +55,7 @@ def to_datetime( format=None, exact=True, unit="ns", - infer_datetime_format=False, + infer_datetime_format=True, origin="unix", cache=True, ): @@ -90,7 +90,7 @@ def to_datetime( origin(unix epoch start). Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. - infer_datetime_format : bool, default False + infer_datetime_format : bool, default True If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. 
In some cases this can increase the parsing @@ -130,6 +130,12 @@ def to_datetime( f"{['ignore', 'raise', 'coerce', 'warn']}, found: " f"{errors}" ) + if infer_datetime_format in {None, False}: + warnings.warn( + "`infer_datetime_format` is deprecated and will " + "be removed in a future version of cudf.", + FutureWarning, + ) if arg is None: return None diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7d01f89eada..8b9a25fa865 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1721,7 +1721,7 @@ def test_datetime_dateoffset_binaryop( date_col, n_periods, frequency, dtype, op ): gsr = cudf.Series(date_col, dtype=dtype) - psr = gsr.to_pandas() # converts to nanos + psr = gsr.to_pandas() kwargs = {frequency: n_periods} diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 90bc7ad8414..f696ad2fe4d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2643,6 +2643,12 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): pa_chunk_array = pa.chunked_array(np_list_data) expect = pd.Series(pa_chunk_array.to_pandas()) + if cudf.api.types.is_datetime64_dtype( + data_type + ) or cudf.api.types.is_timedelta64_dtype(data_type): + # Workaround for an Arrow Bug: + # https://github.com/apache/arrow/issues/34462 + expect = expect.astype(data_type) got = cudf.Series(pa_chunk_array) assert_eq(expect, got) @@ -2657,6 +2663,12 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ) expect = pa_table.to_pandas() + if cudf.api.types.is_datetime64_dtype( + data_type + ) or cudf.api.types.is_timedelta64_dtype(data_type): + # Workaround for an Arrow Bug: + # https://github.com/apache/arrow/issues/34462 + expect = expect.astype(data_type) got = cudf.DataFrame.from_arrow(pa_table) assert_eq(expect, got) @@ -3929,9 +3941,6 @@ def test_all(data): got = gdata.all(bool_only=True) 
expected = pdata.all(bool_only=True) assert_eq(got, expected) - else: - with pytest.raises(NotImplementedError): - gdata.all(level="a") got = gdata.all() expected = pdata.all() @@ -3990,9 +3999,6 @@ def test_any(data, axis): got = gdata.any(bool_only=True) expected = pdata.any(bool_only=True) assert_eq(got, expected) - else: - with pytest.raises(NotImplementedError): - gdata.any(level="a") got = gdata.any(axis=axis) expected = pdata.any(axis=axis) @@ -5187,7 +5193,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): def test_rowwise_ops_datetime_dtypes(data, op, skipna): gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() with expect_warning_if( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 1211938ff10..5f76ed81cc8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -191,8 +191,8 @@ def test_dt_series(data, field): pd_data = pd.Series(data.copy()) gdf_data = Series(pd_data) base = getattr(pd_data.dt, field) - test = getattr(gdf_data.dt, field).to_pandas().astype("int64") - assert_eq(base, test) + test = getattr(gdf_data.dt, field) + assert_eq(base, test, check_dtype=False) @pytest.mark.parametrize("data", [data1(), data2()]) @@ -200,7 +200,7 @@ def test_dt_series(data, field): def test_dt_index(data, field): pd_data = data.copy() gdf_data = DatetimeIndex(pd_data) - assert_eq(getattr(gdf_data, field), getattr(pd_data, field)) + assert_eq(getattr(gdf_data, field), getattr(pd_data, field), exact=False) def test_setitem_datetime(): @@ -614,8 +614,7 @@ def test_datetime_dataframe(): ], ) @pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize("infer_datetime_format", [True, False]) -def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): +def test_cudf_to_datetime(data, dayfirst): pd_data = data if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) @@ -625,14 +624,24 @@ def 
test_cudf_to_datetime(data, dayfirst, infer_datetime_format): else: gd_data = pd_data - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + expected = pd.to_datetime(pd_data, dayfirst=dayfirst) + actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) + + # TODO: Remove typecast to `ns` and following if/else + # workaround after following issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + + if actual is not None and expected is not None: + assert_eq( + actual.astype(pd_data.dtype) + if pd_data is not None + and hasattr(pd_data, "dtype") + and cudf.api.types.is_datetime_dtype(pd_data.dtype) + else actual.astype("datetime64[ns]"), + expected, + ) + else: + assert_eq(actual, expected) @pytest.mark.parametrize( @@ -722,7 +731,11 @@ def test_to_datetime_units(data, unit): expected = pd.to_datetime(pd_data, unit=unit) actual = cudf.to_datetime(gd_data, unit=unit) - assert_eq(actual, expected) + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + + assert_eq(actual.astype("datetime64[ns]"), expected) @pytest.mark.parametrize( @@ -896,6 +909,7 @@ def test_str_to_datetime_error(): np.datetime64("2005-02-25"), np.datetime64("2005-02-25T03:30"), np.datetime64("nat"), + # TODO: https://github.com/pandas-dev/pandas/issues/52295 ], ) @pytest.mark.parametrize("data_dtype", DATETIME_TYPES) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index aad163736c2..fe692a87ca8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -20,7 +20,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.core._compat import PANDAS_LT_153 +from cudf.core._compat import PANDAS_LT_153, PANDAS_GE_200 from 
cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -286,6 +286,16 @@ def test_parquet_reader_basic(parquet_file, columns, engine): if "col_category" in got.columns: got = got.drop(columns=["col_category"]) + if PANDAS_GE_200 and columns is None: + # https://github.com/pandas-dev/pandas/issues/52412 + assert expect["col_datetime64[ms]"].dtype == np.dtype("datetime64[ns]") + assert expect["col_datetime64[us]"].dtype == np.dtype("datetime64[ns]") + expect["col_datetime64[ms]"] = expect["col_datetime64[ms]"].astype( + "datetime64[ms]" + ) + expect["col_datetime64[us]"] = expect["col_datetime64[us]"].astype( + "datetime64[us]" + ) assert_eq(expect, got, check_categorical=False) @@ -1432,7 +1442,16 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): expect = pdf got = pd.read_parquet(gdf_fname) - + if PANDAS_GE_200: + # https://github.com/pandas-dev/pandas/issues/52412 + assert got["col_datetime64[ms]"].dtype == np.dtype("datetime64[ns]") + assert got["col_datetime64[us]"].dtype == np.dtype("datetime64[ns]") + got["col_datetime64[ms]"] = got["col_datetime64[ms]"].astype( + "datetime64[ms]" + ) + got["col_datetime64[us]"] = got["col_datetime64[us]"].astype( + "datetime64[us]" + ) # verify INT96 timestamps were converted back to the same data. 
assert_eq(expect, got, check_categorical=False) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 4b1e8cf1027..7f501373be3 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -11,6 +11,7 @@ import cudf from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.core._compat import PANDAS_GE_200 _TIMEDELTA_DATA = [ [1000000, 200000, 3000000], @@ -528,7 +529,13 @@ def test_timedelta_series_mod_with_scalar_zero(reverse): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), - np.timedelta64("nat", "s"), + pytest.param( + np.timedelta64("nat", "s"), + marks=pytest.mark.xfail( + strict=False, + reason="https://github.com/pandas-dev/pandas/issues/52295", + ), + ), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -686,38 +693,41 @@ def test_timedelta_dt_components(data, dtype): @pytest.mark.parametrize( "data", - _TIMEDELTA_DATA, + _TIMEDELTA_DATA_NON_OVERFLOW, + # TODO-PANDAS-2.0: Replace above with `_TIMEDELTA_DATA` + # after the following issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52386 ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_dt_properties(data, dtype): gsr = cudf.Series(data, dtype=dtype) psr = gsr.to_pandas() - def local_assert(expected, actual): + def local_assert(expected, actual, **kwargs): if gsr.isnull().any(): - assert_eq(expected, actual.astype("float")) + assert_eq(expected, actual.astype("float"), **kwargs) else: - assert_eq(expected, actual) + assert_eq(expected, actual, **kwargs) expected_days = psr.dt.days actual_days = gsr.dt.days - local_assert(expected_days, actual_days) + local_assert(expected_days, actual_days, check_dtype=False) expected_seconds = psr.dt.seconds actual_seconds = gsr.dt.seconds - local_assert(expected_seconds, actual_seconds) + 
local_assert(expected_seconds, actual_seconds, check_dtype=False) expected_microseconds = psr.dt.microseconds actual_microseconds = gsr.dt.microseconds - local_assert(expected_microseconds, actual_microseconds) + local_assert(expected_microseconds, actual_microseconds, check_dtype=False) expected_nanoseconds = psr.dt.nanoseconds actual_nanoseconds = gsr.dt.nanoseconds - local_assert(expected_nanoseconds, actual_nanoseconds) + local_assert(expected_nanoseconds, actual_nanoseconds, check_dtype=False) @pytest.mark.parametrize( @@ -1315,7 +1325,11 @@ def test_numeric_to_timedelta(data, dtype, timedelta_dtype): psr = sr.to_pandas() actual = sr.astype(timedelta_dtype) - expected = pd.Series(psr.to_numpy().astype(timedelta_dtype)) + + if PANDAS_GE_200: + expected = psr.astype(timedelta_dtype) + else: + expected = pd.Series(psr.to_numpy().astype(timedelta_dtype)) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index ab0205df677..3c827b4f242 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -184,7 +184,18 @@ def func(row): ) gdf["a"] = gdf["a"].astype(dtype_l) gdf["b"] = gdf["b"].astype(dtype_r) - run_masked_udf_test(func, gdf, check_dtype=False) + + pdf = gdf.to_pandas(nullable=True) + + expect = op(pdf["a"], pdf["b"]) + obtain = gdf.apply(func, axis=1) + assert_eq(expect, obtain, check_dtype=False) + # TODO: After the following pandas issue is + # fixed, uncomment the following line and delete + # through `to_pandas(nullable=True)` statement. 
+ # https://github.com/pandas-dev/pandas/issues/52411 + + # run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", comparison_ops) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c7a8c8b4096..9fbc099b1a1 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -269,14 +269,14 @@ def to_cudf_compatible_scalar(val, dtype=None): # the string value directly (cudf.DeviceScalar will DTRT) return val - if isinstance(val, datetime.datetime): - val = np.datetime64(val) - elif isinstance(val, datetime.timedelta): - val = np.timedelta64(val) - elif isinstance(val, pd.Timestamp): + if isinstance(val, pd.Timestamp): val = val.to_datetime64() elif isinstance(val, pd.Timedelta): val = val.to_timedelta64() + elif isinstance(val, datetime.datetime): + val = np.datetime64(val) + elif isinstance(val, datetime.timedelta): + val = np.timedelta64(val) val = _maybe_convert_to_default_type( cudf.api.types.pandas_dtype(type(val)) From 60c257aad843bcd262f42d116a908e06de3aac32 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 7 Apr 2023 16:15:44 -0500 Subject: [PATCH 016/162] Fix MultiIndex construction in pandas 2.0 (#13092) This PR removes a `MultiIndex` construction workaround that retains correct dtypes of each level. 
Thus fixing 19 pytests: ```bash = 907 failed, 86196 passed, 2034 skipped, 992 xfailed, 165 xpassed in 536.13s (0:08:56) = ``` On `pandas_2.0_feature_branch`: ```bash = 926 failed, 86177 passed, 2034 skipped, 992 xfailed, 165 xpassed in 545.17s (0:09:05) = ``` --- python/cudf/cudf/core/column_accessor.py | 40 +++++++++++++----------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 707eda3f5e6..d1bfa4dd55d 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -18,10 +18,10 @@ ) import pandas as pd -from packaging.version import Version import cudf from cudf.core import column +from cudf.core._compat import PANDAS_GE_200 if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -247,24 +247,28 @@ def _clear_cache(self): def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: - # Using `from_frame()` instead of `from_tuples` - # prevents coercion of values to a different type - # (e.g., ''->NaT) - with warnings.catch_warnings(): - # Specifying `dtype="object"` here and passing that to - # `from_frame` is deprecated in pandas, but we cannot remove - # that without also losing compatibility with other current - # pandas behaviors like the NaT inference above. For now we - # must catch the warnings internally, but we will need to - # remove this when we implement compatibility with pandas 2.0, - # which will remove these compatibility layers. 
- assert Version(pd.__version__) < Version("2.0.0") - warnings.simplefilter("ignore") - result = pd.MultiIndex.from_frame( - pd.DataFrame( - self.names, columns=self.level_names, dtype="object" - ), + if PANDAS_GE_200: + result = pd.MultiIndex.from_tuples( + self.names, + names=self.level_names, ) + else: + # Using `from_frame()` instead of `from_tuples` + # prevents coercion of values to a different type + # (e.g., ''->NaT) + with warnings.catch_warnings(): + # Specifying `dtype="object"` here and passing that to + # `from_frame` is deprecated in pandas, but we cannot + # remove that without also losing compatibility with other + # current pandas behaviors like the NaT inference above. + warnings.simplefilter("ignore") + result = pd.MultiIndex.from_frame( + pd.DataFrame( + self.names, + columns=self.level_names, + dtype="object", + ), + ) else: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result From be199680e1bbdfea789e3d44400ab8b1b1940588 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Apr 2023 20:13:14 -0500 Subject: [PATCH 017/162] [REVIEW] Enable `numeric_only` for row-wise ops (#13090) This PR enables numeric_only for row-wise ops to be on parity with pandas-2.0. 
--- python/cudf/cudf/core/dataframe.py | 38 ++---- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/tests/test_array_ufunc.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 151 ++++++++------------- 4 files changed, 77 insertions(+), 118 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 416a2047fb2..d9f9f65a9d1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5738,7 +5738,7 @@ def make_false_column_like_self(): # Stats # @_cudf_nvtx_annotate - def _prepare_for_rowwise_op(self, method, skipna): + def _prepare_for_rowwise_op(self, method, skipna, numeric_only): """Prepare a DataFrame for CuPy-based row-wise operations.""" if method not in _cupy_nan_methods_map and any( @@ -5752,26 +5752,23 @@ def _prepare_for_rowwise_op(self, method, skipna): ) raise ValueError(msg) - is_pure_dt = all(is_datetime_dtype(dt) for dt in self.dtypes) - - if not is_pure_dt: + if numeric_only: filtered = self.select_dtypes(include=[np.number, np.bool_]) else: filtered = self.copy(deep=False) - common_dtype = find_common_type(filtered.dtypes) + is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) - if filtered._num_columns < self._num_columns: - # When we update our pandas compatibility target to 2.0, pandas - # will stop supporting numeric_only=None and users will have to - # specify True/False. At that time we should also top our implicit - # removal of non-numeric columns here. - assert Version(pd.__version__) < Version("2.0.0") - msg = ( - "Row-wise operations currently only support int, float " - "and bool dtypes. Non numeric columns are ignored." + common_dtype = find_common_type(filtered.dtypes) + if ( + not numeric_only + and is_string_dtype(common_dtype) + and any(not is_string_dtype(dt) for dt in filtered.dtypes) + ): + raise TypeError( + f"Cannot perform row-wise {method} across mixed-dtype columns," + " try type-casting all the columns to same dtype." 
) - warnings.warn(msg) if not skipna and any(col.nullable for col in filtered._columns): mask = DataFrame( @@ -5857,7 +5854,7 @@ def _reduce( ) source = self._get_columns_by_label(numeric_cols) if source.empty: - return Series(index=cudf.StringIndex([])) + return Series(index=self.index) axis = source._get_axis_from_axis_arg(axis) @@ -6063,12 +6060,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): "Row-wise operations currently do not support `level`." ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." - ) + numeric_only = kwargs.pop("numeric_only", False) min_count = kwargs.pop("min_count", None) if min_count not in (None, 0): @@ -6088,7 +6080,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): kwargs.pop("cast_to_int", None) prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna + method, skipna, numeric_only ) for col in prepared._data.names: if prepared._data[col].nullable: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a07bc60922d..03ac9b6688b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1945,7 +1945,7 @@ def max( self, axis=0, skipna=True, - numeric_only=None, + numeric_only=False, **kwargs, ): """ @@ -1957,7 +1957,7 @@ def max( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - numeric_only: bool, default None + numeric_only: bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are non-numeric columns. 
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 8f4ae7c23a0..2daf942d4b0 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import operator import warnings diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f696ad2fe4d..bed9b81b803 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4890,7 +4890,9 @@ def test_df_constructor_dtype(dtype): { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], - "c": [np.NaN, np.NaN, np.NaN, np.NaN], + "c": cudf.Series( + [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False + ), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), @@ -4910,38 +4912,34 @@ def test_df_constructor_dtype(dtype): "op", ["max", "min", "sum", "product", "mean", "var", "std"] ) @pytest.mark.parametrize("skipna", [True, False]) -def test_rowwise_ops(data, op, skipna): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_rowwise_ops(data, op, skipna, numeric_only): gdf = data pdf = gdf.to_pandas() - kwargs = {"axis": 1, "skipna": skipna} + kwargs = {"axis": 1, "skipna": skipna, "numeric_only": numeric_only} if op in ("var", "std"): kwargs["ddof"] = 0 - with expect_warning_if( - not all( - ( - (pdf[column].count() == 0) - if skipna - else (pdf[column].notna().count() == 0) - ) - or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) - for column in pdf + if not numeric_only and not all( + ( + (pdf[column].count() == 0) + if skipna + else (pdf[column].notna().count() == 0) ) + or cudf.api.types.is_numeric_dtype(pdf[column].dtype) + or cudf.api.types.is_bool_dtype(pdf[column].dtype) + for 
column in pdf ): + with pytest.raises(TypeError): + expected = getattr(pdf, op)(**kwargs) + with pytest.raises(TypeError): + got = getattr(gdf, op)(**kwargs) + else: expected = getattr(pdf, op)(**kwargs) - with expect_warning_if( - not all( - cudf.api.types.is_numeric_dtype(gdf[column].dtype) - or cudf.api.types.is_bool_dtype(gdf[column].dtype) - for column in gdf - ), - UserWarning, - ): got = getattr(gdf, op)(**kwargs) - assert_eq(expected, got, check_exact=False) + assert_eq(expected, got, check_dtype=False) @pytest.mark.parametrize( @@ -4971,67 +4969,18 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): @pytest.mark.parametrize( - "op,expected", + "op", [ - ( - "max", - cudf.Series( - [10.0, None, np.NaN, 2234.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "min", - cudf.Series( - [10.0, None, np.NaN, 13.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "sum", - cudf.Series( - [20.0, None, np.NaN, 2247.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "product", - cudf.Series( - [100.0, None, np.NaN, 29042.0, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "mean", - cudf.Series( - [10.0, None, np.NaN, 1123.5, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "var", - cudf.Series( - [0.0, None, np.NaN, 1233210.25, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), - ( - "std", - cudf.Series( - [0.0, None, np.NaN, 1110.5, None, np.NaN], - dtype="float64", - nan_as_null=False, - ), - ), + "max", + "min", + "sum", + "product", + "mean", + "var", + "std", ], ) -def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): +def test_rowwise_ops_nullable_dtypes_partial_null(op): gdf = cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], @@ -5044,10 +4993,12 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): if op in ("var", "std"): got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) + expected = 
getattr(gdf.to_pandas(), op)(axis=1, ddof=0, skipna=False) else: got = getattr(gdf, op)(axis=1, skipna=False) + expected = getattr(gdf.to_pandas(), op)(axis=1, skipna=False) - assert_eq(got.null_count, expected.null_count) + assert_eq(got.null_count, 2) assert_eq(got, expected) @@ -5190,23 +5141,39 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): ) @pytest.mark.parametrize("op", ["max", "min"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_rowwise_ops_datetime_dtypes(data, op, skipna): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - with expect_warning_if( - not all(cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes), - UserWarning, - ): - got = getattr(gdf, op)(axis=1, skipna=skipna) - with expect_warning_if( - not all(pd.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes), - FutureWarning, + if not numeric_only and not all( + cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes ): - expected = getattr(pdf, op)(axis=1, skipna=skipna) + with pytest.raises(TypeError): + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + with pytest.raises(TypeError): + expected = getattr(pdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + else: + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + expected = getattr(pdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + if got.dtype == cudf.dtype( + "datetime64[us]" + ) and expected.dtype == np.dtype("datetime64[ns]"): + # Workaround for a PANDAS-BUG: + # https://github.com/pandas-dev/pandas/issues/52524 + assert_eq(got.astype("datetime64[ns]"), expected) + else: - assert_eq(got, expected) + assert_eq(got, expected, check_dtype=False) @pytest.mark.parametrize( From 8ff4861685087996b2fd411f2224c0df26687595 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Apr 2023 
21:46:14 -0500 Subject: [PATCH 018/162] [REVIEW] Fix `DataFrame.__getitem__` to work with `pandas-2.0` (#13139) This PR updates `DataFrame.__getitem__` to be able to work with pandas-2.0. For which, we conditionally pass `dtype` to `pandas.Series` constructor so that we don't get a warning in `<2.0` versions. This PR also fixes 76 pytests: ``` = 907 failed, 86353 passed, 2034 skipped, 992 xfailed, 165 xpassed in 504.93s (0:08:24) = ``` on `pandas_2.0_feature_branch`: ``` = 983 failed, 86277 passed, 2034 skipped, 992 xfailed, 165 xpassed in 515.47s (0:08:35) = ``` --- python/cudf/cudf/core/dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index efb3def1eac..760fcef826c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,7 +32,6 @@ import pandas as pd import pyarrow as pa from nvtx import annotate -from packaging.version import Version from pandas._config import get_option from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console @@ -104,6 +103,7 @@ _cudf_nvtx_annotate, _external_only_api, ) +from cudf.core._compat import PANDAS_GE_200 T = TypeVar("T", bound="DataFrame") @@ -1167,13 +1167,13 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - # An explicit dtype is needed to avoid pandas warnings from - # empty sets of columns. This shouldn't be needed in pandas - # 2.0, we don't need to specify a dtype when we know we're not - # trying to match any columns so the default is fine. dtype = None - if len(mask) == 0: - assert Version(pd.__version__) < Version("2.0.0") + if len(mask) == 0 and not PANDAS_GE_200: + # An explicit dtype is needed to avoid pandas + # warnings from empty sets of columns. 
This + # shouldn't be needed in pandas 2.0, we don't + # need to specify a dtype when we know we're not + # trying to match any columns so the default is fine. dtype = "float64" mask = pd.Series(mask, dtype=dtype) if mask.dtype == "bool": From bd38d702955331450e81be1ee83f982c15d04f77 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 18 Apr 2023 10:05:35 -0500 Subject: [PATCH 019/162] Drop backfill and pad in GroupBy (#13156) This PR drops support for `pad` and `backfill` in `GroupBy`. This PR: ``` = 881 failed, 86383 passed, 2034 skipped, 956 xfailed, 165 xpassed in 522.05s (0:08:42) = ``` On `pandas_2.0_feature_branch`: ``` = 911 failed, 86389 passed, 2034 skipped, 956 xfailed, 165 xpassed in 521.12s (0:08:41) = ``` --- docs/cudf/source/api_docs/groupby.rst | 2 - python/cudf/cudf/core/groupby/groupby.py | 56 +++--------------------- python/cudf/cudf/tests/test_groupby.py | 21 ++++----- 3 files changed, 14 insertions(+), 65 deletions(-) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 550a0ef1c89..26dd9bb354b 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -42,7 +42,6 @@ Computations / descriptive stats :toctree: api/ GroupBy.bfill - GroupBy.backfill GroupBy.count GroupBy.cumcount GroupBy.cummax @@ -63,7 +62,6 @@ Computations / descriptive stats GroupBy.ngroup GroupBy.nth GroupBy.nunique - GroupBy.pad GroupBy.prod GroupBy.shift GroupBy.size diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 5e98db0d575..d137651679d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1945,28 +1945,6 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) - def pad(self, limit=None): - """Forward fill NA values. - - .. 
deprecated:: 23.06 - `pad` is deprecated, use `ffill` instead. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - warnings.warn( - "pad is deprecated and will be removed in a future version. " - "Use ffill instead.", - FutureWarning, - ) - return self._scan_fill("ffill", limit) - def ffill(self, limit=None): """Forward fill NA values. @@ -1981,27 +1959,6 @@ def ffill(self, limit=None): return self._scan_fill("ffill", limit) - def backfill(self, limit=None): - """Backward fill NA values. - - .. deprecated:: 23.06 - `backfill` is deprecated, use `bfill` instead. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - warnings.warn( - "backfill is deprecated and will be removed in a future version. " - "Use bfill instead.", - FutureWarning, - ) - return self._scan_fill("bfill", limit) - def bfill(self, limit=None): """Backward fill NA values. @@ -2030,11 +1987,11 @@ def fillna( ---------- value : scalar, dict Value to use to fill the holes. Cannot be specified with method. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + method : { 'bfill', 'ffill', None}, default None Method to use for filling holes in reindexed Series - - pad/ffill: propagate last valid observation forward to next valid - - backfill/bfill: use next valid observation to fill gap + - ffill: propagate last valid observation forward to next valid + - bfill: use next valid observation to fill gap axis : {0 or 'index', 1 or 'columns'} Unsupported inplace : bool, default False @@ -2064,11 +2021,8 @@ def fillna( raise ValueError("Cannot specify both 'value' and 'method'.") if method is not None: - if method not in {"pad", "ffill", "backfill", "bfill"}: - raise ValueError( - "Method can only be of 'pad', 'ffill'," - "'backfill', 'bfill'." 
- ) + if method not in {"ffill", "bfill"}: + raise ValueError("Method can only be of 'ffill', 'bfill'.") return getattr(self, method, limit)() values = self.obj.__class__._from_data( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 24db7008804..472a3fa5976 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2499,7 +2499,7 @@ def test_groupby_various_by_fillna(by, data, args): @pytest.mark.parametrize("nelem", [10, 100, 1000]) -@pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill"]) +@pytest.mark.parametrize("method", ["ffill", "bfill"]) def test_groupby_fillna_method(nelem, method): t = rand_dataframe( dtypes_meta=[ @@ -2538,8 +2538,7 @@ def test_groupby_fillna_method(nelem, method): gdf = cudf.from_pandas(pdf) expect = pdf.groupby(key_col).fillna(method=method) - with expect_warning_if(method in {"pad", "backfill"}): - got = gdf.groupby(key_col).fillna(method=method) + got = gdf.groupby(key_col).fillna(method=method) assert_groupby_results_equal( expect[value_cols], got[value_cols], sort=False @@ -2879,19 +2878,17 @@ def test_groupby_transform_maintain_index(by): ], ) @pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) def test_groupby_pct_change(data, gkey, periods, fill_method): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - with expect_warning_if(fill_method in ("pad", "backfill")): - actual = gdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - with expect_warning_if(fill_method in ("pad", "backfill")): - expected = pdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) + actual = gdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + expected = pdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) 
assert_eq(expected, actual) From 81565cfa490348892b7e883faf78a7000fa2ada0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 18 Apr 2023 10:06:47 -0500 Subject: [PATCH 020/162] [REVIEW] Add `no_default` and adapt `Series.reset_index` to differentiate `None` for `name` parameter (#13152) In `pandas-2.0` the behavior for `name` parameter has changed to actually name a column `0` if no value is passed to `name`. But if `name=None`, the column will be named `None` too: ```python In [1]: import pandas as pd In [2]: s = pd.Series([10, 11, 23], index=[2, 3, 5]) In [3]: s Out[3]: 2 10 3 11 5 23 dtype: int64 In [4]: s.reset_index() Out[4]: index 0 0 2 10 1 3 11 2 5 23 In [5]: s.reset_index(name=None) Out[5]: index None 0 2 10 1 3 11 2 5 23 ``` To achieve the same behavior in `cudf`, we had to introduce `no_default` value(which is same as pandas's `no_default` value). This also fixes 18 pytests: ``` = 965 failed, 86325 passed, 2044 skipped, 992 xfailed, 165 xpassed in 508.32s (0:08:28) = ``` On `pandas_2.0_feature_branch`: ``` = 983 failed, 86277 passed, 2034 skipped, 992 xfailed, 165 xpassed in 541.87s (0:09:01) = ``` --- python/cudf/cudf/api/extensions/__init__.py | 4 +++- python/cudf/cudf/core/series.py | 7 +++++-- python/cudf/cudf/tests/test_series.py | 17 ++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index eeb5dcdb32a..c51fa5dc7ca 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -1,12 +1,14 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from cudf.api.extensions.accessor import ( register_dataframe_accessor, register_index_accessor, register_series_accessor, ) +from pandas.api.extensions import no_default __all__ = [ + "no_default", "register_dataframe_accessor", "register_index_accessor", "register_series_accessor", diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d1a5922f0a5..8fafa97bd47 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -79,6 +79,7 @@ to_cudf_compatible_scalar, ) from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.api.extensions import no_default def _format_percentile_names(percentiles): @@ -996,7 +997,9 @@ def reindex(self, *args, **kwargs): """, ) ) - def reset_index(self, level=None, drop=False, name=None, inplace=False): + def reset_index( + self, level=None, drop=False, name=no_default, inplace=False + ): if not drop and inplace: raise TypeError( "Cannot reset_index inplace on a Series " @@ -1004,7 +1007,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): ) data, index = self._reset_index(level=level, drop=drop) if not drop: - if name is None: + if name is no_default: name = 0 if self.name is None else self.name data[name] = data.pop(self.name) return cudf.core.dataframe.DataFrame._from_data(data, index) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b9b40b9744c..85e5397c7c1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -23,6 +23,7 @@ expect_warning_if, gen_rand, ) +from cudf.api.extensions import no_default def _series_na_data(): @@ -1407,7 +1408,7 @@ def test_nullable_bool_dtype_series(data, bool_dtype): @pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser"]) +@pytest.mark.parametrize("name", [None, 
"ser", no_default]) @pytest.mark.parametrize("inplace", [True, False]) def test_reset_index(level, drop, inplace, original_name, name): midx = pd.MultiIndex.from_tuples( @@ -1422,10 +1423,8 @@ def test_reset_index(level, drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index( - level=level, drop=drop, name=name, inplace=inplace - ) + expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) + got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) if inplace: expect = ps @@ -1450,10 +1449,7 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index( - level=level, drop=drop, inplace=inplace, name=name - ) + expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) if inplace: expect = ps @@ -1479,8 +1475,7 @@ def test_reset_index_named(drop, inplace, original_name, name): "test_reset_index_dup_level_name_exceptions" ) - with expect_warning_if(name is None and not drop): - expect = ps.reset_index(drop=drop, inplace=inplace, name=name) + expect = ps.reset_index(drop=drop, inplace=inplace, name=name) got = gs.reset_index(drop=drop, inplace=inplace, name=name) if inplace: From 199787d25b5e61f135a66bf1fdea9648d796cdae Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 18 Apr 2023 10:50:25 -0500 Subject: [PATCH 021/162] Fix `is_string_dtype` to adapt to `pandas-2.0` changes (#13141) With `pandas-2.0`, `pd.api.types.is_string_dtype(obj)` is going to perform a data-introspection to determine the true dtype of the underlying data. This path won't work for gpu objects, hence this PR adds special handling for GPU objects before we hit `pd.api.types.is_string_dtype(obj)` API. 
This PR fixes 56 pytests: ``` = 927 failed, 86333 passed, 2034 skipped, 992 xfailed, 165 xpassed in 506.69s (0:08:26) = ``` On `pandas_2.0_feature_branch`: ``` = 983 failed, 86277 passed, 2034 skipped, 992 xfailed, 165 xpassed in 557.07s (0:09:17) = ``` --- python/cudf/cudf/api/types.py | 21 ++++++++++++++------- python/cudf/cudf/tests/test_api_types.py | 7 ++++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 06e383f2275..c112132adb6 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -104,13 +104,20 @@ def is_string_dtype(obj): Whether or not the array or dtype is of the string dtype. """ return ( - pd.api.types.is_string_dtype(obj) - # Reject all cudf extension types. - and not is_categorical_dtype(obj) - and not is_decimal_dtype(obj) - and not is_list_dtype(obj) - and not is_struct_dtype(obj) - and not is_interval_dtype(obj) + ( + isinstance(obj, (cudf.Index, cudf.Series)) + and obj.dtype == cudf.dtype("O") + ) + or (isinstance(obj, cudf.core.column.StringColumn)) + or ( + pd.api.types.is_string_dtype(obj) + # Reject all cudf extension types. + and not is_categorical_dtype(obj) + and not is_decimal_dtype(obj) + and not is_list_dtype(obj) + and not is_struct_dtype(obj) + and not is_interval_dtype(obj) + ) ) diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index c2cd78f88a0..04c2aa0b263 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -6,6 +6,7 @@ from pandas.api import types as pd_types import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.api import types @@ -497,8 +498,8 @@ def test_is_integer(obj, expect): (pd.Series(dtype="int"), False), (pd.Series(dtype="float"), False), (pd.Series(dtype="complex"), False), - (pd.Series(dtype="str"), True), - (pd.Series(dtype="unicode"), True), + (pd.Series(dtype="str"), not PANDAS_GE_200), + (pd.Series(dtype="unicode"), not PANDAS_GE_200), (pd.Series(dtype="datetime64[s]"), False), (pd.Series(dtype="timedelta64[s]"), False), (pd.Series(dtype="category"), False), From 47492da6d930b5ae80a45b4422ea422fa3c03621 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 18 Apr 2023 19:37:35 -0500 Subject: [PATCH 022/162] Handle pandas warnings for pad and backfill (#13168) This PR adds pytest handling for warning in case of pad and backfill. --- python/cudf/cudf/tests/test_dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bb544343a74..4af24434cfb 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7572,7 +7572,8 @@ def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): def test_dataframe_bfill(df, alias): gdf = cudf.from_pandas(df) - actual = getattr(df, alias)() + with expect_warning_if(PANDAS_GE_200 and alias == "backfill"): + actual = getattr(df, alias)() with expect_warning_if(alias == "backfill"): expected = getattr(gdf, alias)() assert_eq(expected, actual) @@ -7589,7 +7590,8 @@ def test_dataframe_bfill(df, alias): def test_dataframe_ffill(df, alias): gdf = cudf.from_pandas(df) - actual = getattr(df, alias)() + with expect_warning_if(PANDAS_GE_200 and alias == "pad"): + actual = getattr(df, alias)() with expect_warning_if(alias == "pad"): expected = getattr(gdf, alias)() assert_eq(expected, actual) From
fbe184863af6170e2d482302e3fd2e12286bb443 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 19 Apr 2023 10:34:18 -0500 Subject: [PATCH 023/162] [REVIEW] Fix datetime pytests & raise errors for timezone un-aware typecasting (#13164) This PR fixes some of the `to_datetime` related pytests and also raises error while constructing a time-zone un-aware type to datetime types. This PR fixes 62 pytests: ``` = 745 failed, 87877 passed, 2044 skipped, 956 xfailed, 165 xpassed in 492.06s (0:08:12) = ``` On `pandas_2.0_feature_branch`: ``` = 807 failed, 87819 passed, 2044 skipped, 956 xfailed, 165 xpassed in 488.43s (0:08:08) = ``` --- python/cudf/cudf/core/column/datetime.py | 14 ++++-- python/cudf/cudf/core/column/string.py | 4 ++ python/cudf/cudf/tests/test_datetime.py | 60 +++++++++++++++--------- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 107ebfbbcc3..2c49f17f21c 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -580,11 +580,15 @@ def infer_format(element: str, **kwargs) -> str: raise ValueError("Unable to infer the timestamp format from the data") if len(second_parts) > 1: - # "Z" indicates Zulu time(widely used in aviation) - Which is - # UTC timezone that currently cudf only supports. Having any other - # unsupported timezone will let the code fail below - # with a ValueError. - second_parts.remove("Z") + if "Z" in second_parts: + # "Z" indicates Zulu time(widely used in aviation) - Which is + # UTC timezone that currently cudf only supports. Having any other + # unsupported timezone will let the code fail below + # with a ValueError. 
+ raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + second_part = "".join(second_parts[1:]) if len(second_part) > 1: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d9a6c6c4cd6..19e30eeeb89 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5526,6 +5526,10 @@ def as_datetime_column( self.apply_boolean_mask(self.notnull()).element_indexing(0) ) + if format.endswith("%z"): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return self._as_datetime_or_timedelta_column(out_dtype, format) def as_timedelta_column( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 26dd4f69dbd..10b23745fbd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -10,6 +10,7 @@ import pytest import cudf +import warnings import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 @@ -680,12 +681,14 @@ def test_to_datetime_errors(data): else: gd_data = pd_data - assert_exceptions_equal( - pd.to_datetime, - cudf.to_datetime, - ([pd_data],), - ([gd_data],), - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + assert_exceptions_equal( + pd.to_datetime, + cudf.to_datetime, + ([pd_data],), + ([gd_data],), + ) def test_to_datetime_not_implemented(): @@ -785,14 +788,19 @@ def test_to_datetime_format(data, format, infer_datetime_format): else: gd_data = pd_data - expected = pd.to_datetime( - pd_data, format=format, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, format=format, infer_datetime_format=infer_datetime_format - ) + with expect_warning_if(True, UserWarning): + expected = pd.to_datetime( + pd_data, format=format, infer_datetime_format=infer_datetime_format + ) + with 
expect_warning_if(not infer_datetime_format): + actual = cudf.to_datetime( + gd_data, format=format, infer_datetime_format=infer_datetime_format + ) + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 - assert_eq(actual, expected) + assert_eq(actual.astype("datetime64[ns]"), expected) def test_datetime_can_cast_safely(): @@ -847,7 +855,11 @@ def test_datetime_scalar_timeunit_cast(timeunit): gs = Series(testscalar) ps = pd.Series(testscalar) - assert_eq(ps, gs) + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + + assert_eq(ps, gs.astype("datetime64[ns]")) gdf = DataFrame() gdf["a"] = np.arange(5) @@ -857,6 +869,11 @@ def test_datetime_scalar_timeunit_cast(timeunit): pdf["a"] = np.arange(5) pdf["b"] = testscalar + assert gdf["b"].dtype == cudf.dtype("datetime64[s]") + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + gdf["b"] = gdf["b"].astype("datetime64[ns]") assert_eq(pdf, gdf) @@ -1267,10 +1284,6 @@ def test_datetime_reductions(data, op, dtype): @pytest.mark.parametrize( "data", [ - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), - timezone="UTC", - ), np.datetime_as_string( np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), timezone="UTC", @@ -1294,10 +1307,13 @@ def test_datetime_infer_format(data, dtype): sr = cudf.Series(data) psr = pd.Series(data) - expected = psr.astype(dtype) - actual = sr.astype(dtype) - - assert_eq(expected, actual) + assert_exceptions_equal( + lfunc=psr.astype, + rfunc=sr.astype, + lfunc_args_and_kwargs=([], {"dtype": dtype}), + rfunc_args_and_kwargs=([], {"dtype": dtype}), + check_exception_type=False, + ) def test_dateoffset_instance_subclass_check(): From 615828d4e47da49c029b68de4f8357f81773cce1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 19 Apr 2023 10:39:20 
-0500 Subject: [PATCH 024/162] [REVIEW] Fix pytests where empty column indexes are compared (#13166) This PR fixes pytests where empty column object comparisons fail, this is because of the following inconsistency between pandas & cudf: ```python In [1]: import pandas as pd In [2]: import cudf In [3]: pd.DataFrame().columns Out[3]: RangeIndex(start=0, stop=0, step=1) In [4]: cudf.DataFrame().columns Out[4]: Index([], dtype='object') In [5]: pd.DataFrame().columns.dtype Out[5]: dtype('int64') In [6]: cudf.DataFrame().columns.dtype Out[6]: dtype('O') ``` This PR fixes 28 failures: ``` = 779 failed, 87847 passed, 2044 skipped, 956 xfailed, 165 xpassed in 483.17s (0:08:03) = ``` On `pandas_2.0_feature_branch`: ``` = 807 failed, 87819 passed, 2044 skipped, 956 xfailed, 165 xpassed in 488.43s (0:08:08) = ``` --- python/cudf/cudf/tests/test_concat.py | 27 ++++++++++----- python/cudf/cudf/tests/test_dataframe.py | 44 ++++++++++++++++++++---- python/cudf/cudf/tests/test_groupby.py | 12 ++++++- python/cudf/cudf/tests/test_parquet.py | 9 +++-- 4 files changed, 74 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 910f0b9cf86..925a522399d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -8,9 +8,13 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) def make_frames(index=None, nulls="none"): @@ -365,7 +369,7 @@ def test_pandas_concat_compatibility_axis1_eq_index(): ps1 = s1.to_pandas() ps2 = s2.to_pandas() - with pytest.warns(FutureWarning): + with 
expect_warning_if(not PANDAS_GE_200): assert_exceptions_equal( lfunc=pd.concat, rfunc=gd.concat, @@ -596,7 +600,12 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual[key] = col.fillna(-1) assert_eq(expected, actual, check_dtype=False, check_index_type=True) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -1084,10 +1093,12 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ignore_index=ignore_index, axis=axis, ) - # TODO: change `check_index_type` to `True` - # after following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/46675 - assert_eq(expected, actual, check_index_type=False) + assert_eq( + expected, + actual, + check_index_type=PANDAS_GE_150, + check_column_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("ignore_index", [True, False]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4af24434cfb..66453dd544d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6325,12 +6325,22 @@ def test_dataframe_init_1d_list(data, columns): expect = pd.DataFrame(data, columns=columns) actual = cudf.DataFrame(data, columns=columns) - assert_eq(expect, actual, check_index_type=len(data) != 0) + assert_eq( + expect, + actual, + check_index_type=len(data) != 0, + check_column_type=not PANDAS_GE_200 and len(data) == 0, + ) expect = pd.DataFrame(data, columns=None) actual = cudf.DataFrame(data, columns=None) - assert_eq(expect, actual, check_index_type=len(data) != 0) + assert_eq( + expect, + actual, + check_index_type=len(data) != 0, + check_column_type=not PANDAS_GE_200 and len(data) == 0, + ) @pytest.mark.parametrize( @@ -7190,7 +7200,11 @@ def test_dataframe_from_dict_cp_np_arrays( def test_dataframe_keys(df): gdf = 
cudf.from_pandas(df) - assert_eq(df.keys(), gdf.keys()) + assert_eq( + df.keys(), + gdf.keys(), + exact=not (PANDAS_GE_200 and len(gdf.columns) == 0), + ) @pytest.mark.parametrize( @@ -7662,7 +7676,12 @@ def test_dataframe_concat_lists(df, other, sort, ignore_index): check_column_type=not gdf.empty, ) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=PANDAS_GE_200 and len(gdf.columns) != 0, + ) def test_dataframe_concat_series_without_name(): @@ -7943,6 +7962,7 @@ def test_dataframe_init_with_columns(data, columns): gdf, check_index_type=len(pdf.index) != 0, check_dtype=not (pdf.empty and len(pdf.columns)), + check_column_type=not PANDAS_GE_200, ) @@ -8023,7 +8043,12 @@ def test_dataframe_init_from_series_list(data, ignore_dtype, columns): check_index_type=True, ) else: - assert_eq(expected, actual, check_index_type=True) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=not PANDAS_GE_200, + ) @pytest_unmark_spilling @@ -8114,7 +8139,7 @@ def test_dataframe_init_from_series_list_with_index( actual = actual.sort_index(axis=1) assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) else: - assert_eq(expected, actual) + assert_eq(expected, actual, check_column_type=not PANDAS_GE_200) @pytest.mark.parametrize( @@ -8715,7 +8740,12 @@ def assert_local_eq(actual, df, expected, host_columns): check_index_type=check_index_type, ) else: - assert_eq(expected, actual, check_index_type=check_index_type) + assert_eq( + expected, + actual, + check_index_type=check_index_type, + check_column_type=not PANDAS_GE_200, + ) gdf = cudf.from_pandas(df) host_columns = ( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 472a3fa5976..f5ce7ec95a0 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -18,7 +18,7 @@ import cudf from cudf import 
DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.core.udf.utils import precompiled from cudf.testing._utils import ( @@ -1971,11 +1971,16 @@ def test_groupby_apply_return_series_dataframe(func, args): ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": not PANDAS_GE_200} + else: + kwargs = {} assert_groupby_results_equal( pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, check_index_type=False, # Int64Index v/s Float64Index + **kwargs, ) @@ -1985,10 +1990,15 @@ def test_groupby_no_keys(pdf): ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": not PANDAS_GE_200} + else: + kwargs = {} assert_groupby_results_equal( pdf.groupby([], group_keys=False).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), check_index_type=False, # Int64Index v/s Float64Index + **kwargs, ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fe692a87ca8..2c3f4176674 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -311,7 +311,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine): expect = expect.reset_index(drop=True) got = got.reset_index(drop=True) - assert_eq(expect, got) + assert_eq(expect, got, check_column_type=not PANDAS_GE_200) @pytest.mark.parametrize("has_null", [False, True]) @@ -2210,7 +2210,12 @@ def run_parquet_index(pdf, index): expected = pd.read_parquet(pandas_buffer) actual = cudf.read_parquet(cudf_buffer) - assert_eq(expected, actual, check_index_type=True) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize( From 
8e8a1ea40558745c806dd695d8b3472442eb653c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 19 Apr 2023 13:56:27 -0500 Subject: [PATCH 025/162] [REVIEW] Raise error when there is a binary operation between certain `DataFrame` and `Series` objects (#13138) This PR raises an error when there is a binary operation performed between `DataFrame` & `Series` with unequal `columns` and `index` respectively. This PR fixes 120 pytests: ``` = 833 failed, 86451 passed, 2034 skipped, 968 xfailed, 165 xpassed in 490.86s (0:08:10) = ``` on `pandas_2.0_feature_branch`: ``` = 953 failed, 86307 passed, 2034 skipped, 992 xfailed, 165 xpassed in 511.09s (0:08:31) = ``` --- python/cudf/cudf/core/dataframe.py | 14 ++++++++ python/cudf/cudf/tests/test_dataframe.py | 46 +++++++++++------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 760fcef826c..b6de299e387 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1943,6 +1943,20 @@ def _make_operands_and_index_for_binop( if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} elif isinstance(other, Series): + if ( + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and ( + not self._data.to_pandas_index().equals( + other.index.to_pandas() + ) + ) + ): + raise ValueError( + "Can only compare DataFrame & Series objects " + "whose columns & index are same respectively, " + "please reindex." + ) rhs = dict(zip(other.index.values_host, other.values_host)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 66453dd544d..7f22ffc0df2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9,7 +9,6 @@ import re import string import textwrap -import warnings from collections import OrderedDict, defaultdict from copy import copy @@ -5338,7 +5337,9 @@ def test_cov_nans(): cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), pytest.param( cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), - marks=pytest_xfail, + marks=pytest.mark.xfail( + not PANDAS_GE_200, reason="works only with pandas 2.0+" + ), ), ], ) @@ -5361,39 +5362,32 @@ def test_cov_nans(): ], ) def test_df_sr_binop(gsr, colnames, op): - # Anywhere that the column names of the DataFrame don't match the index - # names of the Series will trigger a deprecated reindexing. Since this - # behavior is deprecated in pandas, this test is temporarily silencing - # those warnings until cudf updates to pandas 2.0 as its compatibility - # target, at which point a large number of the parametrizations can be - # removed altogether (along with this warnings filter). 
- with warnings.catch_warnings(): - assert version.parse(pd.__version__) < version.parse("2.0.0") - warnings.filterwarnings( - action="ignore", - category=FutureWarning, - message=( - "Automatic reindexing on DataFrame vs Series comparisons is " - "deprecated" - ), - ) - data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] - data = dict(zip(colnames, data)) + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] + data = dict(zip(colnames, data)) - gsr = gsr.astype("float64") + gsr = gsr.astype("float64") - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas(nullable=True) + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas(nullable=True) + psr = gsr.to_pandas(nullable=True) + try: expect = op(pdf, psr) + except ValueError: + with pytest.raises(ValueError): + op(gdf, gsr) + with pytest.raises(ValueError): + op(psr, pdf) + with pytest.raises(ValueError): + op(gsr, gdf) + else: got = op(gdf, gsr).to_pandas(nullable=True) - assert_eq(expect, got, check_dtype=False) + assert_eq(expect, got, check_dtype=False, check_like=True) expect = op(psr, pdf) got = op(gsr, gdf).to_pandas(nullable=True) - assert_eq(expect, got, check_dtype=False) + assert_eq(expect, got, check_dtype=False, check_like=True) @pytest_unmark_spilling From 901a9716c18505f7a29749df5bf2f2eece89a49f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 19 Apr 2023 19:53:57 -0500 Subject: [PATCH 026/162] Fix `datetime64` related inconsistencies in pytests (#13175) This PR fixes `datetime64` related pytest failures where pandas returns `ns` time resolutions for quite a lot of cases, i.e., mostly on the IO APIs side. 
Fixes 72 pytests: ``` = 484 failed, 88162 passed, 2044 skipped, 932 xfailed, 165 xpassed in 444.53s (0:07:24) = ``` On `pandas_2.0_feature_branch`: ``` = 556 failed, 88090 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.49s (0:07:36) = ``` --- python/cudf/cudf/tests/test_csv.py | 7 ++++- python/cudf/cudf/tests/test_groupby.py | 32 ++++++++++++++++++++--- python/cudf/cudf/tests/test_index.py | 8 +++++- python/cudf/cudf/tests/test_joining.py | 7 +++++ python/cudf/cudf/tests/test_orc.py | 21 ++++++++++++++- python/cudf/cudf/tests/test_parquet.py | 17 ++++++++++++ python/cudf/cudf/tests/test_resampling.py | 4 ++- python/cudf/cudf/tests/test_string.py | 12 ++++----- 8 files changed, 94 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 4a7804da62c..b66e6bc74fb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -16,7 +16,7 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_LT_140 +from cudf.core._compat import PANDAS_LT_140, PANDAS_GE_200 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -367,6 +367,11 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): assert len(out.columns) == len(df_out.columns) assert len(out) == len(df_out) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + out["2"] = out["2"].astype("datetime64[ns]") assert_eq(df_out, out) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f5ce7ec95a0..63d98ada905 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2648,7 +2648,13 @@ def test_groupby_freq_week(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + 
assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("label", [None, "left", "right"]) @@ -2675,7 +2681,13 @@ def test_groupby_freq_day(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("label", [None, "left", "right"]) @@ -2702,7 +2714,13 @@ def test_groupby_freq_min(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize("label", [None, "left", "right"]) @@ -2729,7 +2747,13 @@ def test_groupby_freq_s(label, closed): got = gdf.groupby( cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed) ).mean() - assert_eq(expect, got, check_like=True, check_dtype=False) + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=not PANDAS_GE_200, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f9ad48c48af..d5d330d7177 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1764,7 +1764,13 @@ def test_index_from_arrow(data): arrow_array = pa.Array.from_pandas(pdi) expected_index = pd.Index(arrow_array.to_pandas()) gdi = cudf.Index.from_arrow(arrow_array) - + if PANDAS_GE_200: + # Arrow bug: + # https://github.com/apache/arrow/issues/33321 + # arrow cannot convert non-nanosecond + # resolution to appropriate type in pandas. + # Hence need to type-cast. 
+ expected_index = expected_index.astype(gdi.dtype) assert_eq(expected_index, gdi) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b197e91882a..c578266ac22 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -15,6 +15,7 @@ assert_exceptions_equal, expect_warning_if, ) +from cudf.core._compat import PANDAS_GE_200 _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") @@ -785,6 +786,12 @@ def test_join_datetimes_index(dtype): assert gdf["d"].dtype == cudf.dtype(dtype) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + gdf = gdf.astype("datetime64[ns]") + assert_join_results_equal(pdf, gdf, how="inner") diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 33095761fde..4701f69d862 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import datetime import decimal @@ -23,6 +23,7 @@ gen_rand_series, supported_numpy_dtypes, ) +from cudf.core._compat import PANDAS_GE_200 # Removal of these deprecated features is no longer imminent. They will not be # removed until a suitable alternative has been implemented. 
As a result, we @@ -159,6 +160,12 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): pdf = orcfile.read().to_pandas(date_as_object=False) gdf = cudf.read_orc(path, use_index=use_index) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + gdf = gdf.astype("datetime64[ns]") + assert_eq(pdf, gdf, check_categorical=False) @@ -1847,6 +1854,12 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): with expect_warning_if(engine == "pyarrow", UserWarning): got = cudf.read_orc(buffer, engine=engine) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]") + assert_eq(negative_timestamp_df, got) @@ -1854,6 +1867,12 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): buffer = BytesIO() negative_timestamp_df.to_orc(buffer) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]") + assert_eq(negative_timestamp_df, pd.read_orc(buffer)) assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 2c3f4176674..ebebd857231 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -664,6 +664,13 @@ def test_parquet_reader_microsecond_timestamps(datadir): expect = pd.read_parquet(fname) got = cudf.read_parquet(fname) + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + assert got["a"].dtype == cudf.dtype("datetime64[us]") + got = got.astype("datetime64[ns]") + assert_eq(expect, got) @@ 
-2513,6 +2520,16 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): got = pd.read_parquet(fname) nullable = num_rows > 0 + if PANDAS_GE_200: + # TODO: Remove typecast to `ns` after following + # issue is fixed: + # https://github.com/pandas-dev/pandas/issues/52449 + gdf["col_datetime64[ms]"] = gdf["col_datetime64[ms]"].astype( + "datetime64[ns]" + ) + gdf["col_datetime64[us]"] = gdf["col_datetime64[us]"].astype( + "datetime64[ns]" + ) assert_eq(gdf.to_pandas(nullable=nullable), got) diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index f0101803995..ce5b05adff1 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -6,6 +6,7 @@ import cudf from cudf.testing._utils import assert_eq +from cudf.core._compat import PANDAS_GE_200 def assert_resample_results_equal(lhs, rhs, **kwargs): @@ -14,6 +15,7 @@ def assert_resample_results_equal(lhs, rhs, **kwargs): rhs.sort_index(), check_dtype=False, check_freq=False, + check_index_type=not PANDAS_GE_200, **kwargs, ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index ad47c79a3cf..2af9c70e706 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -200,12 +200,12 @@ def test_string_astype(dtype): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): data = [ - "2019-06-04T00:00:00Z", - "2019-06-04T12:12:12Z", - "2019-06-03T00:00:00Z", - "2019-05-04T00:00:00Z", - "2018-06-04T00:00:00Z", - "1922-07-21T01:02:03Z", + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", ] elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"] From 
31e08c97ee166ed0b457c310509ad70c85d150e2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 20 Apr 2023 15:20:39 -0500 Subject: [PATCH 027/162] Fix `DataFrame.describe` pytests (#13191) https://github.com/rapidsai/cudf/pull/12890 dropped support for `datetime_is_numeric` from `describe` API. This PR cleans-up a remaining pytest that was using this parameter. This PR fixes 20 pytests: ``` = 464 failed, 88182 passed, 2044 skipped, 932 xfailed, 165 xpassed in 440.68s (0:07:20) = ``` On `pandas_2.0_feature_branch`: ``` = 484 failed, 88162 passed, 2044 skipped, 932 xfailed, 165 xpassed in 457.87s (0:07:37) = ``` --- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7f22ffc0df2..eb7d6ecbc9c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8273,8 +8273,8 @@ def test_dataframe_iterrows_itertuples(): def test_describe_misc_include(df, include): pdf = df.to_pandas() - expected = pdf.describe(include=include, datetime_is_numeric=True) - actual = df.describe(include=include, datetime_is_numeric=True) + expected = pdf.describe(include=include) + actual = df.describe(include=include) for col in expected.columns: if expected[col].dtype == np.dtype("object"): From 27e18c83f4768ec938d5421627302dfcc047c7a8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 21 Apr 2023 19:27:29 -0500 Subject: [PATCH 028/162] Change default `dtype` for `get_dummies` to `bool` (#13174) This PR changes the default dtype for get_dummies to bool from uint8 to match pandas-2.0: pandas-dev/pandas#48022 --- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/core/reshape.py | 40 +++++++++++------------ python/cudf/cudf/tests/test_onehot.py | 47 +++++++++++---------------- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/transform.pyx 
b/python/cudf/cudf/_lib/transform.pyx index a0a8279b213..d8eb6134042 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -163,7 +163,7 @@ def one_hot_encode(Column input_column, Column categories): move(c_result.second), owner=owner, column_names=[ - x if x is not None else 'null' for x in pylist_categories + x if x is not None else '' for x in pylist_categories ] ) return encodings diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index df1a543c4aa..e1b425cab9f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -609,7 +609,7 @@ def get_dummies( cats=None, sparse=False, drop_first=False, - dtype="uint8", + dtype="bool", ): """Returns a dataframe whose columns are the one hot encodings of all columns in `df` @@ -640,7 +640,7 @@ def get_dummies( columns. Note this is different from pandas default behavior, which encodes all columns with dtype object or categorical dtype : str, optional - Output dtype, default 'uint8' + Output dtype, default 'bool' Examples -------- @@ -648,15 +648,15 @@ def get_dummies( >>> df = cudf.DataFrame({"a": ["value1", "value2", None], "b": [0, 0, 0]}) >>> cudf.get_dummies(df) b a_value1 a_value2 - 0 0 1 0 - 1 0 0 1 - 2 0 0 0 + 0 0 True False + 1 0 False True + 2 0 False False >>> cudf.get_dummies(df, dummy_na=True) - b a_None a_value1 a_value2 - 0 0 0 1 0 - 1 0 0 0 1 - 2 0 1 0 0 + b a_ a_value1 a_value2 + 0 0 False True False + 1 0 False False True + 2 0 True False False >>> import numpy as np >>> df = cudf.DataFrame({"a":cudf.Series([1, 2, np.nan, None], @@ -669,11 +669,11 @@ def get_dummies( 3 >>> cudf.get_dummies(df, dummy_na=True, columns=["a"]) - a_1.0 a_2.0 a_nan a_null - 0 1 0 0 0 - 1 0 1 0 0 - 2 0 0 1 0 - 3 0 0 0 1 + a_ a_1.0 a_2.0 a_nan + 0 False True False False + 1 False False True False + 2 False False False True + 3 True False False False >>> series = cudf.Series([1, 2, None, 2, 4]) >>> series @@ -684,12 +684,12 @@ def 
get_dummies( 4 4 dtype: int64 >>> cudf.get_dummies(series, dummy_na=True) - null 1 2 4 - 0 0 1 0 0 - 1 0 0 1 0 - 2 1 0 0 0 - 3 0 0 1 0 - 4 0 0 0 1 + 1 2 4 + 0 False True False False + 1 False False True False + 2 True False False False + 3 False False True False + 4 False False False True """ if cats is None: diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index d42b0e85d28..17ce145a2c2 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from string import ascii_lowercase @@ -23,19 +23,13 @@ (range(10), [1, 2, 3, 4, 5] * 2), ], ) -def test_get_dummies(data, index): +@pytest.mark.parametrize("dtype", ["bool", "uint8"]) +def test_get_dummies(data, index, dtype): gdf = DataFrame({"x": data}, index=index) pdf = pd.DataFrame({"x": data}, index=index) - encoded_expected = pd.get_dummies(pdf, prefix="test") - encoded_actual = cudf.get_dummies(gdf, prefix="test") - - utils.assert_eq( - encoded_expected, - encoded_actual, - check_dtype=len(data) != 0, - ) - encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8) + encoded_expected = pd.get_dummies(pdf, prefix="test", dtype=dtype) + encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=dtype) utils.assert_eq( encoded_expected, @@ -63,16 +57,13 @@ def test_onehot_get_dummies_multicol(n_cols): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("dummy_na", [True, False]) def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na): - pdf = pd.DataFrame({"a": [0, 1, np.nan]}) - df = DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) + df = cudf.DataFrame({"a": [0, 1, np.nan]}, nan_as_null=nan_as_null) + pdf = df.to_pandas(nullable=nan_as_null) expected = pd.get_dummies(pdf, dummy_na=dummy_na, columns=["a"]) got = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"]) - if dummy_na and 
nan_as_null: - got = got.rename(columns={"a_null": "a_nan"})[expected.columns] - - utils.assert_eq(expected, got) + utils.assert_eq(expected, got, check_like=True) @pytest.mark.parametrize( @@ -120,12 +111,12 @@ def test_get_dummies_with_nan(): ) expected = cudf.DataFrame( { - "a_null": [0, 0, 0, 1], - "a_1.0": [1, 0, 0, 0], - "a_2.0": [0, 1, 0, 0], - "a_nan": [0, 0, 1, 0], + "a_": [False, False, False, True], + "a_1.0": [True, False, False, False], + "a_2.0": [False, True, False, False], + "a_nan": [False, False, True, False], }, - dtype="uint8", + dtype="bool", ) actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) @@ -163,13 +154,13 @@ def test_get_dummies_array_like_with_nan(): ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False) expected = cudf.DataFrame( { - "a_null": [0, 0, 0, 1, 0], - "a_0.1": [1, 0, 0, 0, 0], - "a_2.0": [0, 1, 0, 0, 0], - "a_3.0": [0, 0, 1, 0, 0], - "a_nan": [0, 0, 0, 0, 1], + "a_": [False, False, False, True, False], + "a_0.1": [True, False, False, False, False], + "a_2.0": [False, True, False, False, False], + "a_3.0": [False, False, True, False, False], + "a_nan": [False, False, False, False, True], }, - dtype="uint8", + dtype="bool", ) actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_") From 6a863854d1029bba61c5b2164be39c3979bf0ae7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 21 Apr 2023 19:27:47 -0500 Subject: [PATCH 029/162] [REVIEW] Update parameter ordering in `DataFrame.pivot` (#13190) This PR updates parameter ordering in `DataFrame.pivot` to match pandas-2.0. 
This PR fixes 7 related pytests: ``` = 477 failed, 88169 passed, 2044 skipped, 932 xfailed, 165 xpassed in 438.55s (0:07:18) = ``` On `pandas_2.0_feature_branch`: ``` = 484 failed, 88162 passed, 2044 skipped, 932 xfailed, 165 xpassed in 457.87s (0:07:37) = ``` --- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/core/reshape.py | 11 ++++++----- python/cudf/cudf/tests/test_reshape.py | 10 ++-------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b6de299e387..d4d3591a360 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -104,6 +104,7 @@ _external_only_api, ) from cudf.core._compat import PANDAS_GE_200 +from cudf.api.extensions import no_default T = TypeVar("T", bound="DataFrame") @@ -6636,7 +6637,7 @@ def iterrows(self): @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) - def pivot(self, index, columns, values=None): + def pivot(self, *, columns, index=no_default, values=no_default): return cudf.core.reshape.pivot( self, index=index, columns=columns, values=values ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index e1b425cab9f..43d683490b8 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -13,6 +13,7 @@ from cudf._typing import Dtype from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn +from cudf.api.extensions import no_default _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} @@ -905,7 +906,7 @@ def as_tuple(x): ) -def pivot(data, index=None, columns=None, values=None): +def pivot(data, columns=None, index=no_default, values=no_default): """ Return reshaped DataFrame organized by the given index and column values. 
@@ -915,10 +916,10 @@ def pivot(data, index=None, columns=None, values=None): Parameters ---------- - index : column name, optional - Column used to construct the index of the result. columns : column name, optional Column used to construct the columns of the result. + index : column name, optional + Column used to construct the index of the result. values : column name or list of column names, optional Column(s) whose values are rearranged to produce the result. If not specified, all remaining columns of the DataFrame @@ -957,7 +958,7 @@ def pivot(data, index=None, columns=None, values=None): """ df = data values_is_list = True - if values is None: + if values is no_default: values = df._columns_view( col for col in df._column_names if col not in (index, columns) ) @@ -966,7 +967,7 @@ def pivot(data, index=None, columns=None, values=None): values = [values] values_is_list = False values = df._columns_view(values) - if index is None: + if index is no_default: index = df.index else: index = cudf.core.index.Index(df.loc[:, index]) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index bf2c1a32b64..b70d6554c0f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -382,14 +382,8 @@ def test_pivot_simple(index, column, data): pdf = pd.DataFrame({"index": index, "column": column, "data": data}) gdf = cudf.from_pandas(pdf) - # In pandas 2.0 this will be a failure because pandas will require all of - # these as keyword arguments. Matching that check in cudf is a bit - # cumbersome and not worth the effort to match the warning, so this code - # just catches pandas's warning (rather than updating the signature) so - # that when it starts failing we know to update our impl of pivot. 
- with pytest.warns(FutureWarning): - expect = pdf.pivot("index", "column") - got = gdf.pivot("index", "column") + expect = pdf.pivot(columns="column", index="index") + got = gdf.pivot(columns="column", index="index") check_index_and_columns = expect.shape != (0, 0) assert_eq( From ea7d18cc5f3640c13cba1bbccdba3d65e588fdbb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 26 Apr 2023 11:40:01 -0500 Subject: [PATCH 030/162] Fix ceil, floor and round pytests (#13218) A fix for https://github.com/pandas-dev/pandas/issues/52761 has been merged by @mroeschke , this PR xfails the pytests conditionally for `2.0.0` and passes for rest of the versions. This PR fixes 27 pytests: ``` = 404 failed, 88221 passed, 2044 skipped, 959 xfailed, 165 xpassed in 442.21s (0:07:22) = ``` On `pandas_2.0_feature_branch`: ``` = 431 failed, 88221 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.25s (0:07:36) = ``` --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/tests/test_datetime.py | 54 ++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 6ecbe414ebb..183faa12904 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -9,4 +9,5 @@ PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0") PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") +PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 10b23745fbd..68c9f725aa7 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -13,7 +13,7 @@ import warnings import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from 
cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_EQ_200 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1906,8 +1906,22 @@ def test_error_values(): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_ceil(data, time_type, resolution): - +def test_ceil(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -1937,7 +1951,22 @@ def test_ceil(data, time_type, resolution): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_floor(data, time_type, resolution): +def test_floor(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() @@ -1968,7 +1997,22 @@ def test_floor(data, time_type, resolution): @pytest.mark.parametrize( "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] ) -def test_round(data, time_type, resolution): +def test_round(request, data, time_type, resolution): + alias_map = {"L": "ms", "U": "us", "N": "ns"} + request.applymarker( + pytest.mark.xfail( + condition=( + PANDAS_EQ_200 + and resolution in {"L", "ms", "U", "us", "N"} + and np.dtype( + 
f"datetime64[{alias_map.get(resolution, resolution)}]" + ) + > np.dtype(time_type) + ), + reason="https://github.com/pandas-dev/pandas/issues/52761", + strict=True, + ) + ) gs = cudf.Series(data, dtype=time_type) ps = gs.to_pandas() From e355ba46f2a742f1625918854dead3c92553cc68 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 26 Apr 2023 19:08:47 -0700 Subject: [PATCH 031/162] More implementation for get_indexer --- python/cudf/cudf/core/index.py | 199 +++++++++++++++------------ python/cudf/cudf/core/multiindex.py | 134 +++++++++++------- python/cudf/cudf/tests/test_index.py | 177 +++++++----------------- 3 files changed, 245 insertions(+), 265 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9e41d5ed75e..14f7c91eea0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,7 +2,6 @@ from __future__ import annotations -import math import pickle import warnings from functools import cached_property @@ -576,45 +575,45 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ) @_cudf_nvtx_annotate - def get_indexer(self, target, method=None, limit=None, tolerance=None): - # Given an actual integer, - idx = (target - self._start) / self._step - idx_int_upper_bound = (self._stop - self._start) // self._step + def get_indexer(self, target, method=None, tolerance=None): if method is None: - if tolerance is not None: - raise ValueError( - "tolerance argument only valid if using pad, " - "backfill or nearest lookups" - ) - - if idx > idx_int_upper_bound or idx < 0: - raise KeyError(target) - - idx_int = (target - self._start) // self._step - if idx_int != idx: - raise KeyError(target) - return idx_int - - if (method == "ffill" and idx < 0) or ( - method == "bfill" and idx > idx_int_upper_bound - ): - raise KeyError(target) - - round_method = { - "ffill": math.floor, - "bfill": math.ceil, - "nearest": round, - }[method] - if tolerance is not None and (abs(idx) * self._step > 
tolerance): - raise KeyError(target) - return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # Reversed + reverse = self._range[::-1] + start, stop, step = reverse.start, reverse.stop, reverse.step + + target_array = cupy.asarray(target) + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step + + if step != self.step: + # Reversed + locs[valid] = len(self) - 1 - locs[valid] + return locs + else: + return self._as_int_index().get_indexer( + target=target, method=method, tolerance=tolerance + ) @_cudf_nvtx_annotate def get_loc(self, key): # Given an actual integer, - if is_scalar(key): - key = [key] - return self.get_indexer(key) + if not is_scalar(key): + raise TypeError("Should be a sequence") + # Given an actual integer, + idx = (key - self._start) / self._step + idx_int_upper_bound = (self._stop - self._start) // self._step + if idx > idx_int_upper_bound or idx < 0: + raise KeyError(key) + + idx_int = (key - self._start) // self._step + if idx_int != idx: + raise KeyError(key) + return idx_int @_cudf_nvtx_annotate def _union(self, other, sort=None): @@ -1168,10 +1167,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): """ if is_scalar(target): raise TypeError("Should be a sequence") - if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is not supported yet." - ) + # if tolerance is not None: + # raise NotImplementedError( + # "Parameter tolerance is not supported yet." + # ) if method not in { None, "ffill", @@ -1185,6 +1184,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): f" or nearest. 
Got {method}" ) + if not self.is_unique: + raise ValueError("Cannot get index for a non-unique Index.") + is_sorted = ( self.is_monotonic_increasing or self.is_monotonic_decreasing ) @@ -1195,54 +1197,45 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "is specified." ) - target_as_table = cudf.core.frame.Frame({"None": as_column(target)}) - lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + needle_table = cudf.DataFrame( + {"None": as_column(target), "order": arange(0, len(target))} ) - - if lower_bound == upper_bound: - # target not found, apply method - if method in ("pad", "ffill"): - if lower_bound == 0: - raise KeyError(target) - return lower_bound - 1 - elif method in ("backfill", "bfill"): - if lower_bound == self._data.nrows: - raise KeyError(target) - return lower_bound - elif method == "nearest": - if lower_bound == self._data.nrows: - return lower_bound - 1 - elif lower_bound == 0: - return 0 - lower_val = self._column.element_indexing(lower_bound - 1) - upper_val = self._column.element_indexing(lower_bound) - return ( - lower_bound - 1 - if abs(lower_val - target) < abs(upper_val - target) - else lower_bound - ) - else: - raise KeyError(target) - - if lower_bound + 1 == upper_bound: - # Search result is unique, return int. 
- return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) + haystack_table = cudf.DataFrame( + {"None": self._column, "order": arange(0, len(self))} + ) + merged_table = haystack_table.merge( + needle_table, on="None", how="outer" + ) + result_series = ( + merged_table.sort_values(by="order_y") + .head(len(target))["order_x"] + .reset_index(drop=True) + ) + if method is None: + result_series = result_series.fillna(-1) + else: + nonexact = result_series.isnull() + result_series[nonexact] = self.searchsorted( + needle_table["None"][nonexact], + side="left" if method in {"pad", "ffill"} else "right", ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values - mask[true_inds] = True - return mask + if method in {"pad", "ffill"}: + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. 
+ result_series[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + result_series[result_series == len(self)] = -1 + if tolerance is not None: + distance = self[result_series] - needle_table["None"] + # return cupy.where(distance <= tolerance, result_series, -1) + return result_series.where(distance <= tolerance, -1).to_cupy() + return result_series.to_cupy() @_cudf_nvtx_annotate def get_loc(self, key): @@ -1275,8 +1268,40 @@ def get_loc(self, key): 2 """ if is_scalar(key): - key = [key] - return self.get_indexer(target=key) + target = [key] + else: + target = key + + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) + + target_as_table = cudf.core.frame.Frame({"None": as_column(target)}) + lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( + self, target_as_table, is_sorted + ) + + if lower_bound == upper_bound: + raise KeyError(target) + + if lower_bound + 1 == upper_bound: + # Search result is unique, return int. + return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) + + if is_sorted: + # In monotonic index, lex search result is continuous. A slice for + # the range is returned. + return slice(lower_bound, upper_bound) + + # Not sorted and not unique. 
Return a boolean mask + mask = cupy.full(self._data.nrows, False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask[true_inds] = True + return mask @_cudf_nvtx_annotate def __repr__(self): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f533cff7c12..01e7df28020 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1702,60 +1702,45 @@ def get_indexer(self, target, method=None, tolerance=None): raise NotImplementedError( "Parameter tolerance is not supported yet." ) - if method is not None: - raise NotImplementedError( - "only the default get_loc method is currently supported for" - " MultiIndex" - ) - - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - is_unique = self.is_unique - target = (target,) if not isinstance(target, tuple) else target - - # Handle partial target search. If length of `target` is less than `nlevels`, - # Only search levels up to `len(target)` level. 
- target_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(target)} + target = cudf.MultiIndex.from_tuples(target) + needle_table = target.to_frame(index=False) + col_names = list(range(0, self.nlevels)) + needle_table["order"] = needle_table.index + haystack_table = self.copy(deep=True).to_frame(index=False) + haystack_table["order"] = haystack_table.index + merged_table = haystack_table.merge( + needle_table, on=col_names, how="outer" ) - partial_index = self.__class__._from_data( - data=self._data.select_by_index( - slice(target_as_table._num_columns) - ) + result_series = ( + merged_table.sort_values(by="order_y") + .head(len(target))["order_x"] + .reset_index(drop=True) ) - ( - lower_bound, - upper_bound, - sort_inds, - ) = _lexsorted_equal_range(partial_index, target_as_table, is_sorted) - - if lower_bound == upper_bound: - raise KeyError(target) - - if is_unique and lower_bound + 1 == upper_bound: - # Indices are unique (Pandas constraint), search result is unique, - # return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) + if method is None: + result_series = result_series.fillna(-1) + else: + nonexact = result_series.isnull() + result_series[nonexact] = self.searchsorted( + needle_table[col_names][nonexact], + side="left" if method in {"pad", "ffill"} else "right", ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - true_inds = sort_inds.slice(lower_bound, upper_bound).values - true_inds = _maybe_indices_to_slice(true_inds) - if isinstance(true_inds, slice): - return true_inds - - # Not sorted and not unique. 
Return a boolean mask - mask = cp.full(self._data.nrows, False) - mask[true_inds] = True - return mask + if method in {"pad", "ffill"}: + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + result_series[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + result_series[result_series == len(self)] = -1 + if tolerance is not None: + distance = self[result_series] - needle_table["None"] + # return cupy.where(distance <= tolerance, result_series, -1) + return result_series.where(distance <= tolerance, -1).to_cupy() + return result_series.to_cupy() @_cudf_nvtx_annotate def get_loc(self, key): @@ -1814,7 +1799,52 @@ def get_loc(self, key): >>> cudf.from_pandas(x).get_loc(1) slice(1, 5, 1) """ - return self.get_indexer(target=key) + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) + is_unique = self.is_unique + key = (key,) if not isinstance(key, tuple) else key + + # Handle partial key search. If length of `key` is less than `nlevels`, + # Only search levels up to `len(key)` level. + key_as_table = cudf.core.frame.Frame( + {i: column.as_column(k, length=1) for i, k in enumerate(key)} + ) + partial_index = self.__class__._from_data( + data=self._data.select_by_index(slice(key_as_table._num_columns)) + ) + ( + lower_bound, + upper_bound, + sort_inds, + ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + + if lower_bound == upper_bound: + raise KeyError(key) + + if is_unique and lower_bound + 1 == upper_bound: + # Indices are unique (Pandas constraint), search result is unique, + # return int. 
+ return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) + + if is_sorted: + # In monotonic index, lex search result is continuous. A slice for + # the range is returned. + return slice(lower_bound, upper_bound) + + true_inds = sort_inds.slice(lower_bound, upper_bound).values + true_inds = _maybe_indices_to_slice(true_inds) + if isinstance(true_inds, slice): + return true_inds + + # Not sorted and not unique. Return a boolean mask + mask = cp.full(self._data.nrows, False) + mask[true_inds] = True + return mask def _get_reconciled_name_object(self, other) -> MultiIndex: """ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 823209881c8..50afcf4a902 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1939,20 +1939,16 @@ def test_index_set_names_error(idx, level, names): "idx", [pd.Index([1, 3, 6]), pd.Index([6, 1, 3])], # monotonic # non-monotonic ) -@pytest.mark.parametrize("key", list(range(0, 8))) +@pytest.mark.parametrize("key", [list(range(0, 8))]) @pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -def test_get_loc_single_unique_numeric(idx, key, method): +def test_get_indexer_single_unique_numeric(idx, key, method): pi = idx gi = cudf.from_pandas(pi) if ( - (key not in pi and method is None) # `method` only applicable to monotonic index - or (not pi.is_monotonic_increasing and method is not None) - # Get key before the first element is KeyError - or (key == 0 and method in "ffill") - # Get key after the last element is KeyError - or (key == 7 and method in "bfill") + not pi.is_monotonic_increasing + and method is not None ): assert_exceptions_equal( lfunc=pi.get_loc, @@ -1961,10 +1957,9 @@ def test_get_loc_single_unique_numeric(idx, key, method): rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - with expect_warning_if(method is not None): - expected = pi.get_loc(key, method=method) - with 
expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + with expect_warning_if(not PANDAS_GE_200 and method is not None): + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) assert_eq(expected, got) @@ -1982,29 +1977,18 @@ def test_get_loc_single_unique_numeric(idx, key, method): list(range(77, 110, 3)), ], ) -@pytest.mark.parametrize("method", [None, "ffill"]) -def test_get_indexer_rangeindex(idx, key, method): +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +@pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) +def test_get_indexer_rangeindex(idx, key, method, tolerance): pi = idx gi = cudf.from_pandas(pi) - # if ( - # (any(k not in pi for k in key) and method is None) - # # Get key before the first element is KeyError - # or (key < pi.start and method in "ffill") - # # Get key after the last element is KeyError - # or (key >= pi.stop and method in "bfill") - # ): - # assert_exceptions_equal( - # lfunc=pi.get_indexer, - # rfunc=gi.get_indexer, - # lfunc_args_and_kwargs=([], {"key": key, "method": method}), - # rfunc_args_and_kwargs=([], {"key": key, "method": method}), - # ) - # else: - # with expect_warning_if(method is not None): - expected = pi.get_indexer(key, method=method) - # with expect_warning_if(method is not None): - got = gi.get_indexer(key, method=method) + expected = pi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) + got = gi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) assert_eq(expected, got) @@ -2066,17 +2050,17 @@ def test_get_loc_single_duplicate_numeric(idx, key): @pytest.mark.parametrize( "idx", [ - pd.Index([1, 3, 3, 6]), # monotonic - pd.Index([6, 1, 3, 3]), # non-monotonic + pd.Index([-1, 2, 3, 6]), # monotonic + pd.Index([6, 1, 3, 4]), # non-monotonic ], ) -@pytest.mark.parametrize("key", [0, 3, 6, 7]) -@pytest.mark.parametrize("method", [None]) 
+@pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_single_duplicate_numeric(idx, key, method): pi = idx gi = cudf.from_pandas(pi) - if key not in pi: + if not pi.is_monotonic_increasing and method is not None: assert_exceptions_equal( lfunc=pi.get_indexer, rfunc=gi.get_indexer, @@ -2115,21 +2099,13 @@ def test_get_loc_single_unique_string(idx, key): @pytest.mark.parametrize( "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] ) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) +@pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_single_unique_string(idx, key, method): pi = idx gi = cudf.from_pandas(pi) - if ( - (key not in pi and method is None) - # `method` only applicable to monotonic index - or (not pi.is_monotonic_increasing and method is not None) - # Get key before the first element is KeyError - or (key == "a" and method == "ffill") - # Get key after the last element is KeyError - or (key == "z" and method == "bfill") - ): + if not pi.is_monotonic_increasing and method is not None: assert_exceptions_equal( lfunc=pi.get_indexer, rfunc=gi.get_indexer, @@ -2166,15 +2142,19 @@ def test_get_loc_single_duplicate_string(idx, key): @pytest.mark.parametrize( - "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] + "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["a", "f", "m", "q"])] ) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -@pytest.mark.parametrize("method", [None]) +@pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_single_duplicate_string(idx, key, method): pi = idx gi = cudf.from_pandas(pi) - if key not in pi: + if ( + # `method` only applicable to monotonic index + (not pi.is_monotonic_increasing and method is not None) + or not 
pi.is_unique + ): assert_exceptions_equal( lfunc=pi.get_indexer, rfunc=gi.get_indexer, @@ -2231,28 +2211,20 @@ def test_get_loc_multi_numeric(idx, key): [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] ), pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)] + [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)] ), ], ) -@pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) +@pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) @pytest.mark.parametrize("method", [None]) def test_get_indexer_multi_numeric(idx, key, method): pi = idx.sort_values() gi = cudf.from_pandas(pi) - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( @@ -2303,46 +2275,27 @@ def test_get_loc_multi_numeric_deviate(idx, key, result): "idx", [ pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] ) ], ) @pytest.mark.parametrize( - "key, result", + "key", [ - (1, slice(1, 5, 1)), # deviates - ((1, 2), slice(1, 3, 1)), - ((1, 2, 3), slice(1, 2, None)), - ((2, 1, 1), slice(0, 1, None)), - ((9, 9, 9), None), + ((1, 2, 3),), + ((2, 1, 1),), + ((9, 9, 9),), ], ) -@pytest.mark.parametrize("method", [None]) -def test_get_indexer_multi_numeric_deviate(idx, key, result, method): +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_numeric_deviate(idx, key, method): pi = idx gi = cudf.from_pandas(pi) - with expect_warning_if( 
- isinstance(key, tuple), pd.errors.PerformanceWarning - ): - key_flag = key not in pi - - if key_flag: - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = result - with expect_warning_if(method is not None): - got = gi.get_loc(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( @@ -2454,48 +2407,20 @@ def test_get_loc_multi_string(idx, key): ("b", "c", "a"), ] ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("b", "a", "a"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), ], ) @pytest.mark.parametrize( - "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] + "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] ) -@pytest.mark.parametrize("method", [None]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_multi_string(idx, key, method): pi = idx.sort_values() gi = cudf.from_pandas(pi) - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( From 569b3e7fd4da778f1e2effadc529ef17387809b1 Mon Sep 
17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 26 Apr 2023 21:10:02 -0500 Subject: [PATCH 032/162] Fix `kurtosis` pytests to support `numeric_only` parameter (#13217) https://github.com/rapidsai/cudf/pull/12847 introduced support for `numeric_only`, this PR cleans up a `kurt` related pytest that was relying on the old behavior. This PR fixes 18 pytests : ``` = 413 failed, 88257 passed, 2044 skipped, 932 xfailed, 165 xpassed in 463.03s (0:07:43) = ``` On `pandas_2.0_feature_branch`: ``` = 431 failed, 88221 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.25s (0:07:36) = ``` --- python/cudf/cudf/tests/test_stats.py | 55 +++++++++++++--------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 126a90e580c..12a08bdcefa 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -233,44 +233,39 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), - cudf.Series(np.zeros(100)), - cudf.Series(np.repeat(np.nan, 100)), - cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), - cudf.Series( - [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False - ), - cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([]), - cudf.Series([-3]), + {"data": np.random.normal(-100, 100, 1000)}, + {"data": np.random.randint(-50, 50, 1000)}, + {"data": (np.zeros(100))}, + {"data": np.repeat(np.nan, 100)}, + {"data": np.array([1.123, 2.343, np.nan, 0.0])}, + { + "data": [5, 10, 53, None, np.nan, None, 12, 43, -423], + "nan_as_null": False, + }, + {"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, + {"data": []}, + {"data": [-3]}, ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_series(data, null_flag): - pdata = data.to_pandas() 
+@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_series(data, null_flag, numeric_only): + gs = cudf.Series(**data) + ps = gs.to_pandas() - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None + if null_flag and len(gs) > 2: + gs.iloc[[0, 2]] = None + ps.iloc[[0, 2]] = None - got = data.kurtosis() - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurtosis() - np.testing.assert_array_almost_equal(got, expected) + got = gs.kurtosis(numeric_only=numeric_only) + expected = ps.kurtosis(numeric_only=numeric_only) - got = data.kurt() - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt() - np.testing.assert_array_almost_equal(got, expected) + assert_eq(got, expected) - got = data.kurt(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=False) - np.testing.assert_array_almost_equal(got, expected) + got = gs.kurt(numeric_only=numeric_only) + expected = ps.kurt(numeric_only=numeric_only) - with pytest.raises(NotImplementedError): - data.kurt(numeric_only=True) + assert_eq(got, expected) @pytest.mark.parametrize( From bbc84f6dee786e117d904da2c523ee35dd921976 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 26 Apr 2023 21:11:02 -0500 Subject: [PATCH 033/162] Fix parquet pytests errors with pandas-2.0 (#13216) Pandas-2.0 fixed the following issue where dtype of the column was being changed even when `np.nan` was not being written to it: ```python In [1]: import pandas as pd In [2]: df = pd.DataFrame({'a':[1, 2, 3]}) In [3]: df.dtypes Out[3]: a int64 dtype: object In [4]: df Out[4]: a 0 1 1 2 2 3 In [7]: df[[False]*3] = np.nan In [8]: df Out[8]: a 0 1 1 2 2 3 In [9]: df.dtypes Out[9]: a int64 dtype: object ``` Bug in pre-2.0: ```python In [1]: import pandas as pd In [2]: df = pd.DataFrame({'a':[1, 2, 3]}) In [3]: df.dtypes Out[3]: a int64 dtype: object In [4]: df Out[4]: a 0 1 1 2 2 3 In [7]: df[[False]*3] = 
np.nan In [8]: df Out[8]: a 0 1.0 1 2.0 2 3.0 In [9]: df.dtypes Out[9]: a float64 dtype: object ``` `make_pdf` was basically operating correctly with the help of this bug, this PR makes some fixes to the method and the callers to preserve the pytest behaviors. This PR fixes 6 pytests: ``` = 425 failed, 88227 passed, 2044 skipped, 932 xfailed, 165 xpassed in 471.32s (0:07:51) = ``` On `pandas_2.0_feature_branch`: ``` = 431 failed, 88221 passed, 2044 skipped, 932 xfailed, 165 xpassed in 456.25s (0:07:36) = ``` --- python/cudf/cudf/tests/test_parquet.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ebebd857231..c0d9af6d67d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -216,10 +216,13 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): ) test_pdf.columns.name = None - # Randomly but reproducibly mark subset of rows as invalid - random.seed(1337) - mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.NaN + if nvalids: + # Randomly but reproducibly mark subset of rows as invalid + random.seed(1337) + mask = random.sample(range(nrows), nvalids) + test_pdf[test_pdf.index.isin(mask)] = np.NaN + if dtype: + test_pdf = test_pdf.astype(dtype) return test_pdf @@ -693,7 +696,7 @@ def test_parquet_reader_select_columns(datadir): def test_parquet_reader_invalids(tmpdir): - test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype=np.int64) + test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype="Int64") fname = tmpdir.join("invalids.parquet") test_pdf.to_parquet(fname, engine="pyarrow") @@ -701,7 +704,7 @@ def test_parquet_reader_invalids(tmpdir): expect = pd.read_parquet(fname) got = cudf.read_parquet(fname) - assert_eq(expect, got) + assert_eq(expect, got.to_pandas(nullable=True)) def test_parquet_reader_filenotfound(tmpdir): @@ -788,8 +791,8 @@ def 
create_parquet_source(df, src_type, fname): "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] ) def test_parquet_reader_multiple_files(tmpdir, src): - test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2) - test_pdf2 = make_pdf(nrows=500) + test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64") + test_pdf2 = make_pdf(nrows=500, dtype="float64") expect = pd.concat([test_pdf1, test_pdf2]) src1 = create_parquet_source(test_pdf1, src, tmpdir.join("multi1.parquet")) @@ -1465,8 +1468,8 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) - test_pdf2 = make_pdf(nrows=20) + test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") + test_pdf2 = make_pdf(nrows=20, dtype="float64") expect = pd.concat([test_pdf1, test_pdf2]) tmpdir.mkdir("multi_part") From 3a85f646a961790c4547a54501679dc983db99a7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 23 May 2023 13:39:09 -0500 Subject: [PATCH 034/162] Fix csv reader pytest & MultiIndex docstring (#13417) Pandas-2.0 moved to a very strict & consistent date format inference, we should plan to move similarly but meanwhile for the pytest to pass, we will need to pass date_format='mixed'. This PR also fixes a miscellaneous issue with MultiIndex.copy docstring. --- python/cudf/cudf/core/multiindex.py | 15 ++++++--------- python/cudf/cudf/tests/test_csv.py | 3 +++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dcc6783147b..0498aa474b6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -348,8 +348,6 @@ def copy( ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], ... names=['Date', 'Symbol']) >>> idx2 = idx1.copy( - ... levels=[['day1', 'day2'], ['com1', 'com2']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], ... 
names=['col1', 'col2']) >>> df.index = idx1 @@ -363,13 +361,12 @@ def copy( >>> df.index = idx2 >>> df - Close - col1 col2 - day1 com1 3400.00 - com2 226.58 - day2 com1 3401.80 - com2 228.91 - + Close + col1 col2 + 2020-08-27 AMZN 3400.00 + MSFT 226.58 + 2020-08-28 AMZN 3401.80 + MSFT 228.91 """ mi = MultiIndex._from_data(self._data.copy(deep=deep)) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index b66e6bc74fb..5bb6de49f10 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -246,11 +246,14 @@ def test_csv_reader_datetime(parse_dates): parse_dates=parse_dates, dayfirst=True, ) + # Need to used `date_format='mixed'`, + # https://github.com/pandas-dev/pandas/issues/53355 pdf = pd.read_csv( StringIO(buffer), names=["date1", "date2", "bad"], parse_dates=parse_dates, dayfirst=True, + date_format="mixed", ) assert_eq(gdf, pdf) From c1e78b9665fcd8df63d518d0b57983b53c862c31 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 May 2023 15:26:10 -0500 Subject: [PATCH 035/162] Deprecate `Groupby.dtypes` (#13453) This PR deprecates `Groupby.dtypes` since it is deprecated in `pandas-2.1` This PR fixes 5 pytests: ``` = 474 failed, 95510 passed, 2044 skipped, 763 xfailed, 300 xpassed in 459.93s (0:07:39) = ``` On `pandas_2.0_feature_branch`: ``` = 479 failed, 95505 passed, 2044 skipped, 763 xfailed, 300 xpassed in 471.66s (0:07:51) = ``` --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/groupby/groupby.py | 19 ++++++++++++++----- python/cudf/cudf/tests/test_groupby.py | 13 +++++++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 183faa12904..bbcde903871 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -11,3 +11,4 @@ PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") PANDAS_GE_200 = 
PANDAS_VERSION >= version.parse("2.0.0") +PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index df91625d7f2..a2d973605ba 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -291,6 +291,9 @@ def dtypes(self): """ Return the dtypes in this group. + .. deprecated:: 23.08 + Use `.dtypes` on base object instead. + Returns ------- pandas.DataFrame @@ -302,17 +305,23 @@ def dtypes(self): >>> df = cudf.DataFrame({'a': [1, 2, 3, 3], 'b': ['x', 'y', 'z', 'a'], ... 'c':[10, 11, 12, 12]}) >>> df.groupby("a").dtypes - b c + a b c a - 1 object int64 - 2 object int64 - 3 object int64 + 1 int64 object int64 + 2 int64 object int64 + 3 int64 object int64 """ + warnings.warn( + f"{type(self).__name__}.dtypes is deprecated and will be " + "removed in a future version. Check the dtypes on the " + "base object instead", + FutureWarning, + ) index = self.grouping.keys.unique().sort_values().to_pandas() return pd.DataFrame( { name: [self.obj._dtypes[name]] * len(index) - for name in self.grouping.values._column_names + for name in self.obj._data.names }, index=index, ) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a486bf8ff89..a560196f14b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -19,7 +19,12 @@ import cudf from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 +from cudf.core._compat import ( + PANDAS_GE_150, + PANDAS_LT_140, + PANDAS_GE_200, + PANDAS_GE_210, +) from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.core.udf.utils import precompiled from cudf.testing._utils import ( @@ -3100,8 +3105,12 @@ def test_groupby_dtypes(groups): {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} ) pdf = df.to_pandas() + with 
expect_warning_if(PANDAS_GE_210): + expected = pdf.groupby(groups).dtypes + with pytest.warns(FutureWarning): + actual = df.groupby(groups).dtypes - assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) + assert_eq(expected, actual) @pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) From 2dafcfcddf570263e3c244e2e03897cf8e3fc40b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 May 2023 15:39:17 -0500 Subject: [PATCH 036/162] Enforce Groupby.__iter__ deprecation and miscellaneous pytest fixes (#13423) This PR: - [x] Enforces deprecation in `GroupBy.__iter__` - [x] Fixes miscellaneous pytest failures due to already existing differences in cudf vs pandas & newly introduced `inferred_type` in Index. --- python/cudf/cudf/core/column/column.py | 11 +++++++++- python/cudf/cudf/core/dataframe.py | 9 +++++--- python/cudf/cudf/core/groupby/groupby.py | 12 +++------- .../cudf/cudf/tests/test_column_accessor.py | 13 +++++++++-- python/cudf/cudf/tests/test_dataframe.py | 9 ++++++-- python/cudf/cudf/tests/test_groupby.py | 22 +++++++------------ python/cudf/cudf/tests/test_replace.py | 7 +++--- 7 files changed, 49 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b08c35d8997..0a87dc144c1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2314,7 +2314,16 @@ def as_column( pa_type = np_to_pa_dtype( _maybe_convert_to_default_type("float") ) - + if ( + pa_type is None + and isinstance(arbitrary, pd.Index) + and arbitrary.shape == (0,) + ): + # When an empty `pd.Index` is passed to `pa.array`, + # a type of `null-type` is returned by pyarrow, hence + # we need this workaround to preserve the dtype of + # column being created. 
+ pa_type = np_to_pa_dtype(arbitrary.dtype) data = as_column( pa.array( arbitrary, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index df2f87c805f..675b870056d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5852,6 +5852,7 @@ def _reduce( ): source = self + axis = source._get_axis_from_axis_arg(axis) if numeric_only: numeric_cols = ( name @@ -5860,9 +5861,11 @@ def _reduce( ) source = self._get_columns_by_label(numeric_cols) if source.empty: - return Series(index=self.index) - - axis = source._get_axis_from_axis_arg(axis) + return Series( + index=self._data.to_pandas_index()[:0] + if axis == 0 + else source.index + ) if axis == 0: try: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a2d973605ba..9e9b52a7538 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -272,19 +272,13 @@ def __init__( self.grouping = _Grouping(obj, by, level) def __iter__(self): - if isinstance(self._by, list) and len(self._by) == 1: - warnings.warn( - "In a future version of cudf, a length 1 tuple will be " - "returned when iterating over a groupby with a grouper equal " - "to a list of length 1. 
To avoid this warning, do not supply " - "a list with a single grouper.", - FutureWarning, - ) group_names, offsets, _, grouped_values = self._grouped() if isinstance(group_names, cudf.BaseIndex): group_names = group_names.to_pandas() for i, name in enumerate(group_names): - yield name, grouped_values[offsets[i] : offsets[i + 1]] + yield (name,) if isinstance(self._by, list) and len( + self._by + ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]] @property def dtypes(self): diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 99d4bdd9910..b983c2dcab9 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import pandas as pd @@ -7,6 +7,7 @@ import cudf from cudf.core.column_accessor import ColumnAccessor from cudf.testing._utils import assert_eq +from cudf.core._compat import PANDAS_GE_200 simple_test_data = [ {}, @@ -52,7 +53,15 @@ def test_to_pandas_simple(simple_data): Test that a ColumnAccessor converts to a correct pd.Index """ ca = ColumnAccessor(simple_data) - assert_eq(ca.to_pandas_index(), pd.DataFrame(simple_data).columns) + # We cannot return RangeIndex, while pandas returns RangeIndex. + # Pandas compares `inferred_type` which is `empty` for + # Index([], dtype='object'), and `integer` for RangeIndex() + # to ignore this `inferred_type` comparison, we pass exact=False. 
+ assert_eq( + ca.to_pandas_index(), + pd.DataFrame(simple_data).columns, + exact=not PANDAS_GE_200, + ) def test_to_pandas_multiindex(mi_data): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ee1309ef402..5875959b0c2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -308,7 +308,7 @@ def test_axes(data): actual = csr.axes for e, a in zip(expected, actual): - assert_eq(e, a) + assert_eq(e, a, exact=not PANDAS_GE_200) def test_dataframe_truncate_axis_0(): @@ -4938,7 +4938,12 @@ def test_rowwise_ops(data, op, skipna, numeric_only): expected = getattr(pdf, op)(**kwargs) got = getattr(gdf, op)(**kwargs) - assert_eq(expected, got, check_dtype=False) + assert_eq( + expected, + got, + check_dtype=False, + check_index_type=False if len(got.index) == 0 else True, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a560196f14b..5583b2290ae 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -33,7 +33,6 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - expect_warning_if, ) from cudf.testing.dataset_generator import rand_dataframe @@ -982,8 +981,7 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - with pytest.warns(FutureWarning): - pdg = pdf.groupby("x").sum() + pdg = pdf.groupby("x").sum(numeric_only=True) # cudf does not yet support numeric_only, so our default is False (unlike # pandas, which defaults to inferring and throws a warning about it). gdg = gdf.groupby("x").sum() @@ -1547,15 +1545,11 @@ def test_grouping(grouper): ) gdf = cudf.from_pandas(pdf) - # There's no easy way to validate that the same warning is thrown by both - # cudf and pandas here because it's only thrown upon iteration, so we - # settle for catching warnings on the whole block. 
- with expect_warning_if(isinstance(grouper, list) and len(grouper) == 1): - for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper) - ): - assert pdf_group[0] == gdf_group[0] - assert_eq(pdf_group[1], gdf_group[1]) + for pdf_group, gdf_group in zip( + pdf.groupby(grouper), gdf.groupby(grouper) + ): + assert pdf_group[0] == gdf_group[0] + assert_eq(pdf_group[1], gdf_group[1]) @pytest.mark.parametrize("agg", [lambda x: x.count(), "count"]) @@ -3311,8 +3305,8 @@ def test_head_tail_empty(): expected = pdf.groupby(pd.Series(values)).head() got = df.groupby(cudf.Series(values)).head() - assert_eq(expected, got) + assert_eq(expected, got, check_column_type=not PANDAS_GE_200) expected = pdf.groupby(pd.Series(values)).tail() got = df.groupby(cudf.Series(values)).tail() - assert_eq(expected, got) + assert_eq(expected, got, check_column_type=not PANDAS_GE_200) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 13e44e7cf59..364afacd261 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -1008,8 +1008,9 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): pd.Series(["one", "two", "three"], dtype="category"), {"to_replace": "one", "value": "two", "inplace": True}, marks=pytest.mark.xfail( - condition=not PANDAS_GE_134, - reason="https://github.com/pandas-dev/pandas/issues/43232", + condition=(not PANDAS_GE_134) or (PANDAS_GE_200), + reason="https://github.com/pandas-dev/pandas/issues/43232" + "https://github.com/pandas-dev/pandas/issues/53358", ), ), ( From 16c987e2051e98bd3f714d1ad69ea7bb894eb4e1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: 
Tue, 30 May 2023 08:41:25 -0500 Subject: [PATCH 037/162] Preserve Index and grouped columns in `Groupby.nth` (#13442) In pandas-2.0 `groupby.nth` behavior has changed: https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations This PR enables preserving the callers index in the end result and returns grouping columns as part of the result. This PR fixes all 12 pytests in `python/cudf/cudf/tests/test_groupby.py::test_groupby_nth` --- python/cudf/cudf/core/groupby/groupby.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9e9b52a7538..f79a337373e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -802,10 +802,21 @@ def nth(self, n): """ Return the nth row from each group. """ - result = self.agg(lambda x: x.nth(n)).sort_index() - sizes = self.size().sort_index() - return result[sizes > n] + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + # We perform another groupby here to have the grouping columns + # be a part of dataframe columns. 
+ result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) + sizes = self.size().reindex(result.index) + + result = result[sizes > n] + + result._index = self.obj.index.take( + result._data["__groupbynth_order__"] + ) + del result._data["__groupbynth_order__"] + del self.obj._data["__groupbynth_order__"] + return result @_cudf_nvtx_annotate def ngroup(self, ascending=True): From 258bf3df9d0d29068985c43d43597f480165a17f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 30 May 2023 08:43:47 -0500 Subject: [PATCH 038/162] `Index` class deprecation enforcements (#13204) This PR: - [x] Enforces `Index` related deprecations by removing `Float32Index`, `Float64Index`, `GenericIndex`, `Int8Index`, `Int16Index`, `Int32Index`, `Int64Index`, `StringIndex`, `UInt8Index`, `UInt16Index`, `UInt32Index`, `UInt64Index`. - [x] Cleans up the repr logic to more closely align with pandas for `` value representation in case of `string` dtype. - [x] Fixes docstring and pytests to support the removals of the above classes. 
This PR also fixes 202 pytests: ```bash = 267 failed, 95670 passed, 2044 skipped, 763 xfailed, 300 xpassed in 442.18s (0:07:22) = ``` On `pandas_2.0_feature_branch`: ```bash = 469 failed, 95464 passed, 2044 skipped, 763 xfailed, 300 xpassed in 469.26s (0:07:49) = ``` --- docs/cudf/source/api_docs/index_objects.rst | 3 - docs/cudf/source/conf.py | 2 +- .../source/developer_guide/library_design.md | 25 +- python/cudf/benchmarks/conftest.py | 6 +- python/cudf/cudf/__init__.py | 24 - python/cudf/cudf/_typing.py | 6 +- python/cudf/cudf/core/_base_index.py | 72 +- python/cudf/cudf/core/algorithms.py | 8 +- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/string.py | 10 +- python/cudf/cudf/core/dataframe.py | 16 +- python/cudf/cudf/core/dtypes.py | 11 +- python/cudf/cudf/core/frame.py | 8 +- python/cudf/cudf/core/groupby/groupby.py | 3 +- python/cudf/cudf/core/index.py | 674 +++--------------- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/multiindex.py | 14 +- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/series.py | 6 +- python/cudf/cudf/core/single_column_frame.py | 4 +- python/cudf/cudf/testing/testing.py | 26 +- python/cudf/cudf/tests/test_binops.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_groupby.py | 7 +- python/cudf/cudf/tests/test_index.py | 109 +-- python/cudf/cudf/tests/test_monotonic.py | 10 +- python/cudf/cudf/tests/test_pack.py | 10 +- python/cudf/cudf/tests/test_pickling.py | 6 +- python/cudf/cudf/tests/test_repr.py | 31 +- python/cudf/cudf/tests/test_serialize.py | 4 +- python/cudf/cudf/tests/test_string.py | 5 +- python/dask_cudf/dask_cudf/backends.py | 8 +- 33 files changed, 284 insertions(+), 846 deletions(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a8d18e9cb7..1b748a8f69f 100644 --- a/docs/cudf/source/api_docs/index_objects.rst 
+++ b/docs/cudf/source/api_docs/index_objects.rst @@ -149,9 +149,6 @@ Numeric Index :template: autosummary/class_without_autosummary.rst RangeIndex - Int64Index - UInt64Index - Float64Index .. _api.categoricalindex: diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 2d3d2494747..4d9558ecd33 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -261,7 +261,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): from the processed docstring. """ if what == "class": - if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: + if name in {"cudf.RangeIndex", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: cut_index = lines.index('.. rubric:: Attributes') lines[:] = lines[:cut_index] diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 16b84476549..e1f91a6417d 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -22,7 +22,7 @@ Finally we tie these pieces together to provide a more holistic view of the proj % class IndexedFrame % class SingleColumnFrame % class BaseIndex -% class GenericIndex +% class Index % class MultiIndex % class RangeIndex % class DataFrame @@ -42,8 +42,8 @@ Finally we tie these pieces together to provide a more holistic view of the proj % BaseIndex <|-- MultiIndex % Frame <|-- MultiIndex % -% BaseIndex <|-- GenericIndex -% SingleColumnFrame <|-- GenericIndex +% BaseIndex <|-- Index +% SingleColumnFrame <|-- Index % % @enduml @@ -89,31 +89,26 @@ While we've highlighted some exceptional cases of Indexes before, let's start wi In practice, `BaseIndex` does have concrete implementations of a small set of 
methods. However, currently many of these implementations are not applicable to all subclasses and will be eventually be removed. -Almost all indexes are subclasses of `GenericIndex`, a single-columned index with the class hierarchy: +Almost all indexes are subclasses of `Index`, a single-columned index with the class hierarchy: ```python -class GenericIndex(SingleColumnFrame, BaseIndex) +class Index(SingleColumnFrame, BaseIndex) ``` Integer, float, or string indexes are all composed of a single column of data. -Most `GenericIndex` methods are inherited from `Frame`, saving us the trouble of rewriting them. +Most `Index` methods are inherited from `Frame`, saving us the trouble of rewriting them. We now consider the three main exceptions to this model: - A `RangeIndex` is not backed by a column of data, so it inherits directly from `BaseIndex` alone. Wherever possible, its methods have special implementations designed to avoid materializing columns. - Where such an implementation is infeasible, we fall back to converting it to an `Int64Index` first instead. + Where such an implementation is infeasible, we fall back to converting it to an `Index` of `int64` + dtype first instead. - A `MultiIndex` is backed by _multiple_ columns of data. Therefore, its inheritance hierarchy looks like `class MultiIndex(Frame, BaseIndex)`. Some of its more `Frame`-like methods may be inherited, but many others must be reimplemented since in many cases a `MultiIndex` is not expected to behave like a `Frame`. -- Just like in pandas, `Index` itself can never be instantiated. - `pandas.Index` is the parent class for indexes, - but its constructor returns an appropriate subclass depending on the input data type and shape. - Unfortunately, mimicking this behavior requires overriding `__new__`, - which in turn makes shared initialization across inheritance trees much more cumbersome to manage. 
- To enable sharing constructor logic across different index classes, - we instead define `BaseIndex` as the parent class of all indexes. +- To enable sharing constructor logic across different index classes, + we define `BaseIndex` as the parent class of all indexes. `Index` inherits from `BaseIndex`, but it masquerades as a `BaseIndex` to match pandas. - This class should contain no implementations since it is simply a factory for other indexes. ## The Column layer diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 4f2bb96061f..5d0f80189c9 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. """Defines pytest fixtures for all benchmarks. @@ -40,8 +40,8 @@ In addition to the above fixtures, we also provide the following more specialized fixtures: - rangeindex: Since RangeIndex always holds int64 data we cannot conflate - it with index_dtype_int64 (a true Int64Index), and it cannot hold nulls. - As a result, it is provided as a separate fixture. + it with index_dtype_int64 (a true Index with int64 dtype), and it + cannot hold nulls. As a result, it is provided as a separate fixture. 
""" import os diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index de0f2d67add..c64da9a8ab2 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -40,22 +40,10 @@ BaseIndex, CategoricalIndex, DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, IntervalIndex, RangeIndex, - StringIndex, TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, interval_range, ) from cudf.core.missing import NA @@ -106,15 +94,8 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", - "Float32Index", - "Float64Index", - "GenericIndex", "Grouper", "Index", - "Int16Index", - "Int32Index", - "Int64Index", - "Int8Index", "IntervalDtype", "IntervalIndex", "ListDtype", @@ -123,13 +104,8 @@ "RangeIndex", "Scalar", "Series", - "StringIndex", "StructDtype", "TimedeltaIndex", - "UInt16Index", - "UInt32Index", - "UInt64Index", - "UInt8Index", "api", "concat", "crosstab", diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index e2ea12a0e4d..79762edbd65 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import sys from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union @@ -37,9 +37,7 @@ DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] -SeriesOrSingleColumnIndex = Union[ - "cudf.Series", "cudf.core.index.GenericIndex" -] +SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation AggType = Union[str, Callable] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7d16824174a..46e7cdfac61 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -58,9 +58,9 @@ >>> import cudf >>> index = cudf.Index([1, 2, 3]) >>> index -Int64Index([1, 2, 3], dtype='int64') +Index([1, 2, 3], dtype='int64') >>> index.astype('float64') -Float64Index([1.0, 2.0, 3.0], dtype='float64') +Index([1.0, 2.0, 3.0], dtype='float64') """ @@ -135,7 +135,7 @@ def get_level_values(self, level): >>> import cudf >>> idx = cudf.Index(["a", "b", "c"]) >>> idx.get_level_values(0) - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ if level == self.name: @@ -182,7 +182,7 @@ def _clean_nulls_from_index(self): to `` as a preprocessing step to `__repr__` methods. This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` + to string dtype but it is the responsibility of the `__repr__` methods using this method to replace or handle representation of the actual types correctly. 
""" @@ -225,7 +225,7 @@ def hasnans(self): >>> import numpy as np >>> index = cudf.Index([1, 2, np.nan, 3, 4], nan_as_null=False) >>> index - Float64Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') + Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') >>> index.hasnans True @@ -233,7 +233,7 @@ def hasnans(self): >>> index = cudf.Index([1, 2, None, 3, 4]) >>> index - Int64Index([1, 2, , 3, 4], dtype='int64') + Index([1, 2, , 3, 4], dtype='int64') >>> index.hasnans True """ @@ -286,9 +286,9 @@ def set_names(self, names, level=None, inplace=False): >>> import cudf >>> idx = cudf.Index([1, 2, 3, 4]) >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + Index([1, 2, 3, 4], dtype='int64', name='quarter') >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]]) >>> idx @@ -347,7 +347,7 @@ def union(self, other, sort=None): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + Index([1, 2, 3, 4, 5, 6], dtype='int64') MultiIndex case @@ -437,7 +437,7 @@ def intersection(self, other, sort=False): >>> idx1 = cudf.Index([1, 2, 3, 4]) >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') + Index([3, 4], dtype='int64') MultiIndex case @@ -541,9 +541,9 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, , 4], dtype='int64') + Index([1, 2, , 4], dtype='int64') >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') + Index([1, 2, 3, 4], dtype='int64') """ if downcast is not None: raise NotImplementedError( @@ -635,13 +635,13 @@ def to_pandas(self, nullable=False): >>> import cudf >>> idx = cudf.Index([-3, 10, 15, 20]) >>> idx - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> 
idx.to_pandas() - Int64Index([-3, 10, 15, 20], dtype='int64') + Index([-3, 10, 15, 20], dtype='int64') >>> type(idx.to_pandas()) - + >>> type(idx) - + """ raise NotImplementedError @@ -666,7 +666,7 @@ def isin(self, values): -------- >>> idx = cudf.Index([1,2,3]) >>> idx - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') Check whether each index value in a list of values. @@ -736,17 +736,17 @@ def append(self, other): >>> import cudf >>> idx = cudf.Index([1, 2, 10, 100]) >>> idx - Int64Index([1, 2, 10, 100], dtype='int64') + Index([1, 2, 10, 100], dtype='int64') >>> other = cudf.Index([200, 400, 50]) >>> other - Int64Index([200, 400, 50], dtype='int64') + Index([200, 400, 50], dtype='int64') >>> idx.append(other) - Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') append accepts list of Index objects >>> idx.append([other, other]) - Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') + Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') """ raise NotImplementedError @@ -778,14 +778,14 @@ def difference(self, other, sort=None): >>> import cudf >>> idx1 = cudf.Index([2, 1, 3, 4]) >>> idx1 - Int64Index([2, 1, 3, 4], dtype='int64') + Index([2, 1, 3, 4], dtype='int64') >>> idx2 = cudf.Index([3, 4, 5, 6]) >>> idx2 - Int64Index([3, 4, 5, 6], dtype='int64') + Index([3, 4, 5, 6], dtype='int64') >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') + Index([1, 2], dtype='int64') >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') + Index([2, 1], dtype='int64') """ if sort not in {None, False}: raise ValueError( @@ -1231,18 +1231,18 @@ def sort_values( >>> import cudf >>> idx = cudf.Index([10, 100, 1, 1000]) >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + Index([10, 100, 1, 1000], dtype='int64') Sort values in ascending order (default behavior). 
>>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + Index([1, 10, 100, 1000], dtype='int64') Sort values in descending order, and also get the indices `idx` was sorted by. >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], dtype=int32)) Sorting values in a MultiIndex: @@ -1319,7 +1319,7 @@ def join( names=['a', 'b']) >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs - Int64Index([1, 4, 3], dtype='int64', name='a') + Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') MultiIndex([(3, 4), (1, 2)], @@ -1402,12 +1402,12 @@ def rename(self, name, inplace=False): >>> import cudf >>> index = cudf.Index([1, 2, 3], name='one') >>> index - Int64Index([1, 2, 3], dtype='int64', name='one') + Index([1, 2, 3], dtype='int64', name='one') >>> index.name 'one' >>> renamed_index = index.rename('two') >>> renamed_index - Int64Index([1, 2, 3], dtype='int64', name='two') + Index([1, 2, 3], dtype='int64', name='two') >>> renamed_index.name 'two' """ @@ -1501,9 +1501,9 @@ def from_pandas(cls, index, nan_as_null=None): >>> data = [10, 20, 30, np.nan] >>> pdi = pd.Index(data) >>> cudf.Index.from_pandas(pdi) - Float64Index([10.0, 20.0, 30.0, ], dtype='float64') + Index([10.0, 20.0, 30.0, ], dtype='float64') >>> cudf.Index.from_pandas(pdi, nan_as_null=False) - Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') + Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") @@ -1674,7 +1674,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): -------- >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) >>> idx.take([2, 0, 4, 3]) - StringIndex(['c' 'a' 'e' 'd'], dtype='object') + Index(['c', 'a', 'e', 'd'], dtype='object') """ if axis not in {0, "index"}: @@ -1725,9 +1725,9 @@ def repeat(self, repeats, 
axis=None): -------- >>> index = cudf.Index([10, 22, 33, 55]) >>> index - Int64Index([10, 22, 33, 55], dtype='int64') + Index([10, 22, 33, 55], dtype='int64') >>> index.repeat(5) - Int64Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33, + Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 55, 55, 55, 55, 55], dtype='int64') """ diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 50ec4b774ee..56bb575d6d6 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -46,7 +46,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([0, 1, 1], dtype=int8) >>> uniques - StringIndex(['a' 'c'], dtype='object') + Index(['a', 'c'], dtype='object') When ``use_na_sentinel=True`` (the default), missing values are indicated in the `codes` with the sentinel value ``-1`` and missing values are not @@ -56,7 +56,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([ 1, -1, 0, 2, 1], dtype=int8) >>> uniques - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') If NA is in the values, and we want to include NA in the uniques of the values, it can be achieved by setting ``use_na_sentinel=False``. 
@@ -66,12 +66,12 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): >>> codes array([ 0, 1, 0, -1], dtype=int8) >>> uniques - Float64Index([1.0, 2.0], dtype='float64') + Index([1.0, 2.0], dtype='float64') >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False) >>> codes array([1, 2, 1, 0], dtype=int8) >>> uniques - Float64Index([<NA>, 1.0, 2.0], dtype='float64') + Index([<NA>, 1.0, 2.0], dtype='float64') """ return_cupy_array = isinstance(values, cp.ndarray) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c026574f8cd..6352f9f1fa0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -63,7 +63,7 @@ class CategoricalAccessor(ColumnMethods): dtype: category Categories (3, int64): [1, 2, 3] >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') + Index([1, 2, 3], dtype='int64') >>> s.cat.reorder_categories([3,2,1]) 0 1 1 2 @@ -106,7 +106,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex): super().__init__(parent=parent) @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ The categories of this categorical. """ diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index c1b6dad00b7..0e7bcdc296c 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from __future__ import annotations @@ -8,7 +8,7 @@ import cudf -ParentType = Union["cudf.Series", "cudf.core.index.GenericIndex"] +ParentType = Union["cudf.Series", "cudf.core.index.Index"] class ColumnMethods: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2e74ec62204..0205d0ee43b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -708,9 +708,9 @@ def contains( >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN] >>> idx = cudf.Index(data) >>> idx - StringIndex(['Mouse' 'dog' 'house and parrot' '23.0' None], dtype='object') + Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object') >>> idx.str.contains('23', regex=False) - GenericIndex([False, False, False, True, <NA>], dtype='bool') + Index([False, False, False, True, <NA>], dtype='bool') Returning 'house' or 'dog' when either expression occurs in a string. @@ -2811,7 +2811,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - StringIndex(['X 123' 'Y 999'], dtype='object') + Index(['X 123', 'Y 999'], dtype='object') Which will create a MultiIndex: @@ -2876,7 +2876,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - StringIndex(['X 123' 'Y 999'], dtype='object') + Index(['X 123', 'Y 999'], dtype='object') Which will create a MultiIndex: @@ -3542,7 +3542,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') + Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 675b870056d..624e378011a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ 
-1673,7 +1673,7 @@ def _concat( if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.GenericIndex._concat( + out._index = cudf.core.index.Index._concat( [o._index for o in objs] ) @@ -3381,7 +3381,7 @@ def rename( if index: if ( any(type(item) == str for item in index.values()) - and type(self.index) != cudf.StringIndex + and type(self.index._values) != cudf.core.column.StringColumn ): raise NotImplementedError( "Implicit conversion of index to " @@ -6606,7 +6606,7 @@ def keys(self): Columns: [0, 1, 2, 3] Index: [] >>> df.keys() - Int64Index([0, 1, 2, 3], dtype='int64') + Index([0, 1, 2, 3], dtype='int64') """ return self._data.to_pandas_index() @@ -7308,14 +7308,14 @@ def from_pandas(obj, nan_as_null=None): >>> pidx = pd.Index([1, 2, 10, 20]) >>> pidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> gidx = cudf.from_pandas(pidx) >>> gidx - Int64Index([1, 2, 10, 20], dtype='int64') + Index([1, 2, 10, 20], dtype='int64') >>> type(gidx) - + >>> type(pidx) - + Converting a Pandas MultiIndex to cuDF MultiIndex: @@ -7494,7 +7494,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.index.GenericIndex._concat(indexes) + merged_index = cudf.core.index.Index._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index edd557aad1f..dce595b0843 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -162,7 +162,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None: self._ordered = ordered @property - def categories(self) -> "cudf.core.index.GenericIndex": + def categories(self) -> "cudf.core.index.Index": """ An ``Index`` containing the 
unique categories allowed. @@ -171,7 +171,7 @@ def categories(self) -> "cudf.core.index.GenericIndex": >>> import cudf >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> dtype.categories - StringIndex(['b' 'a'], dtype='object') + Index(['b', 'a'], dtype='object') """ if self._categories is None: return cudf.core.index.as_index( @@ -238,9 +238,10 @@ def to_pandas(self) -> pd.CategoricalDtype: if self._categories is None: categories = None else: - if isinstance( - self._categories, (cudf.Float32Index, cudf.Float64Index) - ): + if self._categories.dtype in { + cudf.dtype("float32"), + cudf.dtype("float64"), + }: categories = self._categories.dropna().to_pandas() else: categories = self._categories.to_pandas() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c7330da5cfa..89b38fad376 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -210,12 +210,12 @@ def size(self): >>> index = cudf.Index([]) >>> index - Float64Index([], dtype='float64') + Index([], dtype='float64') >>> index.size 0 >>> index = cudf.Index([1, 2, 3, 10]) >>> index - Int64Index([1, 2, 3, 10], dtype='int64') + Index([1, 2, 3, 10], dtype='int64') >>> index.size 4 @@ -1289,7 +1289,7 @@ def isna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.isna() array([False, False, True, True, False, False]) """ @@ -1368,7 +1368,7 @@ def notna(self): >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) >>> idx - Float64Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') + Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.notna() array([ True, True, False, False, True, True]) """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f79a337373e..8e88d994708 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ 
b/python/cudf/cudf/core/groupby/groupby.py @@ -531,7 +531,8 @@ def agg(self, func): orig_dtypes = tuple(c.dtype for c in columns) # Note: When there are no key columns, the below produces - # a Float64Index, while Pandas returns an Int64Index + # an Index with float64 dtype, while Pandas returns + # an Index with int64 dtype. # (GH: 6945) ( result_columns, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 783f4012311..c0664d3ca4d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -9,12 +9,10 @@ from numbers import Number from typing import ( Any, - Dict, List, MutableMapping, Optional, Tuple, - Type, Union, ) @@ -22,6 +20,7 @@ import numpy as np import pandas as pd from pandas._config import get_option +from typing_extensions import Self import cudf from cudf._lib.datetime import extract_quarter, is_leap_year @@ -34,7 +33,6 @@ is_interval_dtype, is_list_like, is_scalar, - is_string_dtype, ) from cudf.core._base_index import BaseIndex, _index_astype_docstring from cudf.core.column import ( @@ -66,8 +64,33 @@ from cudf.core._compat import PANDAS_GE_200 +class IndexMeta(type): + """Custom metaclass for Index that overrides instance/subclass tests.""" + + def __call__(cls, data, *args, **kwargs): + if cls is Index: + return as_index( + arbitrary=data, + *args, + **kwargs, + ) + return super().__call__(data, *args, **kwargs) + + def __instancecheck__(self, instance): + if self is cudf.Index: + return isinstance(instance, BaseIndex) + else: + return False + + def __subclasscheck__(self, subclass): + if self is cudf.Index: + return issubclass(subclass, BaseIndex) + else: + return False + + def _lexsorted_equal_range( - idx: Union[GenericIndex, cudf.MultiIndex], + idx: Union[Index, cudf.MultiIndex], key_as_table: Frame, is_sorted: bool, ) -> Tuple[int, int, Optional[ColumnBase]]: @@ -100,18 +123,13 @@ def _index_from_data(data: MutableMapping, name: Any = None): values = next(iter(data.values())) if 
isinstance(values, NumericalColumn): - try: - index_class_type: Type[ - Union[GenericIndex, cudf.MultiIndex] - ] = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex + index_class_type = Index elif isinstance(values, DatetimeColumn): index_class_type = DatetimeIndex elif isinstance(values, TimeDeltaColumn): index_class_type = TimedeltaIndex elif isinstance(values, StringColumn): - index_class_type = StringIndex + index_class_type = Index elif isinstance(values, CategoricalColumn): index_class_type = CategoricalIndex elif isinstance(values, (IntervalColumn, StructColumn)): @@ -195,8 +213,8 @@ def __init__( self._end = self._start + self._step * (len(self._range) - 1) def _copy_type_metadata( - self: RangeIndex, other: RangeIndex, *, override_dtypes=None - ) -> RangeIndex: + self, other: RangeIndex, *, override_dtypes=None + ) -> Self: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. return self @@ -564,7 +582,7 @@ def __rmul__(self, other): def _as_int_index(self): # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. 
- return _dtype_to_index[self.dtype.type]._from_data(self._data) + return cudf.Index._from_data(self._data) @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -770,13 +788,13 @@ def sort_values( @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) @@ -784,7 +802,7 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return _dtype_to_index[self.dtype.type]._from_columns( + return cudf.Index._from_columns( [self._as_int_index()._split(splits)], [self.name] ) @@ -917,7 +935,7 @@ def __abs__(self): return abs(self._as_int_index()) -class GenericIndex(SingleColumnFrame, BaseIndex): +class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ An array of orderable values that represent the indices of another Column @@ -939,21 +957,6 @@ class GenericIndex(SingleColumnFrame, BaseIndex): @_cudf_nvtx_annotate def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) - - # normalize the input - if isinstance(data, cudf.Series): - data = data._column - elif isinstance(data, column.ColumnBase): - data = data - else: - if isinstance(data, (list, tuple)): - if len(data) == 0: - data = np.asarray([], dtype="int64") - else: - data = np.asarray(data) - data = column.as_column(data) - assert isinstance(data, (NumericalColumn, StringColumn)) - name = kwargs.get("name") super().__init__({name: data}) @@ -985,8 +988,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pandas returns 
numpy arrays when the outputs are boolean. for i, o in enumerate(out): # We explicitly _do not_ use isinstance here: we want only - # boolean GenericIndexes, not dtype-specific subclasses. - if type(o) is GenericIndex and o.dtype.kind == "b": + # boolean Indexes, not dtype-specific subclasses. + if type(o) is Index and o.dtype.kind == "b": out[i] = o.values return out[0] if ufunc.nout == 1 else tuple(out) @@ -995,14 +998,21 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @classmethod @_cudf_nvtx_annotate - def _from_data( - cls, data: MutableMapping, name: Any = None - ) -> GenericIndex: + def _from_data(cls, data: MutableMapping, name: Any = None) -> Self: out = super()._from_data(data=data) if name is not None: out.name = name return out + @classmethod + @_cudf_nvtx_annotate + def from_arrow(cls, obj): + try: + return cls(ColumnBase.from_arrow(obj)) + except TypeError: + # Try interpreting object as a MultiIndex before failing. + return cudf.MultiIndex.from_arrow(obj) + def _binaryop( self, other: Frame, @@ -1019,16 +1029,16 @@ def _binaryop( # pandas returns numpy arrays when the outputs are boolean. We # explicitly _do not_ use isinstance here: we want only boolean - # GenericIndexes, not dtype-specific subclasses. - if type(ret) is GenericIndex and ret.dtype.kind == "b": + # Indexes, not dtype-specific subclasses. + if type(ret) is Index and ret.dtype.kind == "b": return ret.values return ret # Override just to make mypy happy. 
@_cudf_nvtx_annotate def _copy_type_metadata( - self: GenericIndex, other: GenericIndex, *, override_dtypes=None - ) -> GenericIndex: + self, other: Self, *, override_dtypes=None + ) -> Self: return super()._copy_type_metadata( other, override_dtypes=override_dtypes ) @@ -1294,9 +1304,10 @@ def __repr__(self): output = output.replace("nan", cudf._NA_REP) elif preprocess._values.nullable: - output = repr(self._clean_nulls_from_index().to_pandas()) - - if not isinstance(self, StringIndex): + if isinstance(self._values, StringColumn): + output = repr(self.to_pandas(nullable=True)) + else: + output = repr(self._clean_nulls_from_index().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1341,7 +1352,7 @@ def __getitem__(self, index): @_cudf_nvtx_annotate def dtype(self): """ - `dtype` of the underlying values in GenericIndex. + `dtype` of the underlying values in Index. """ return self._values.dtype @@ -1382,19 +1393,21 @@ def get_slice_bound(self, label, side): return self._values.get_slice_bound(label, side) def _is_numeric(self): - return False + return isinstance( + self._values, cudf.core.column.NumericalColumn + ) and self.dtype != cudf.dtype("bool") def _is_boolean(self): - return True + return self.dtype == cudf.dtype("bool") def _is_integer(self): - return False + return cudf.api.types.is_integer_dtype(self.dtype) def _is_floating(self): - return False + return cudf.api.types.is_float_dtype(self.dtype) def _is_object(self): - return False + return isinstance(self._values, cudf.core.column.StringColumn) def _is_categorical(self): return False @@ -1536,333 +1549,19 @@ def isin(self, values): return self._values.isin(values).values - -class NumericIndex(GenericIndex): - """Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. 
- copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Index - """ - - # Subclasses must define the dtype they are associated with. - _dtype: Union[None, Type[np.number]] = None - + @copy_docstring(StringMethods) # type: ignore + @property @_cudf_nvtx_annotate - def __init__(self, data=None, dtype=None, copy=False, name=None): - warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - - dtype = type(self)._dtype - if copy: - data = column.as_column(data, dtype=dtype).copy() - - kwargs = _setdefault_name(data, name=name) - - data = column.as_column(data, dtype=dtype) - - super().__init__(data, **kwargs) - - def _is_numeric(self): - return True - - def _is_boolean(self): - return False - - def _is_integer(self): - return True - - def _is_floating(self): - return False - - def _is_object(self): - return False - - def _is_categorical(self): - return False - - def _is_interval(self): - return False - - -class Int8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int8Index is a special case of Index with purely - integer(``int8``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int8Index - """ - - _dtype = np.int8 - - -class Int16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int16Index is a special case of Index with purely - integer(``int16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. 
- name : object - Name to be stored in the index. - - Returns - ------- - Int16Index - """ - - _dtype = np.int16 - - -class Int32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int32Index is a special case of Index with purely - integer(``int32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int32Index - """ - - _dtype = np.int32 - - -class Int64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Int64Index is a special case of Index with purely - integer(``int64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Int64Index - """ - - _dtype = np.int64 - - -class UInt8Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt8Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt8Index - """ - - _dtype = np.uint8 - - -class UInt16Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt16Index is a special case of Index with purely - integer(``uint16``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. 
- name : object - Name to be stored in the index. - - Returns - ------- - UInt16Index - """ - - _dtype = np.uint16 - - -class UInt32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt32Index is a special case of Index with purely - integer(``uint32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt32Index - """ - - _dtype = np.uint32 - - -class UInt64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - UInt64Index is a special case of Index with purely - integer(``uint64``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - UInt64Index - """ - - _dtype = np.uint64 - - -class Float32Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float32Index is a special case of Index with purely - float(``float32``) labels. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Float32Index - """ - - _dtype = np.float32 - - def _is_integer(self): - return False - - def _is_floating(self): - return True - - -class Float64Index(NumericIndex): - """ - Immutable, ordered and sliceable sequence of labels. - The basic object storing row labels for all cuDF objects. - Float64Index is a special case of Index with purely - float(``float64``) labels. 
- - Parameters - ---------- - data : array-like (1-dimensional) - dtype : NumPy dtype, - but not used. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - - Returns - ------- - Float64Index - """ - - _dtype = np.float64 - - def _is_integer(self): - return False - - def _is_floating(self): - return True + def str(self): + if isinstance(self._values, cudf.core.column.StringColumn): + return StringMethods(parent=self) + else: + raise AttributeError( + "Can only use .str accessor with string values!" + ) -class DatetimeIndex(GenericIndex): +class DatetimeIndex(Index): """ Immutable , ordered and sliceable sequence of datetime64 data, represented internally as int64. @@ -1952,7 +1651,6 @@ def __init__( if copy: data = data.copy() - super().__init__(data, **kwargs) @property # type: ignore @@ -1970,7 +1668,7 @@ def year(self): >>> datetime_index DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year - Int16Index([2000, 2001, 2002], dtype='int16') + Index([2000, 2001, 2002], dtype='int16') """ # noqa: E501 return self._get_dt_field("year") @@ -1989,7 +1687,7 @@ def month(self): >>> datetime_index DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("month") @@ -2008,7 +1706,7 @@ def day(self): >>> datetime_index DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day - Int16Index([1, 2, 3], dtype='int16') + Index([1, 2, 3], dtype='int16') """ # noqa: E501 return self._get_dt_field("day") @@ -2029,7 +1727,7 @@ def hour(self): '2000-01-01 02:00:00'], dtype='datetime64[ns]') >>> datetime_index.hour - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("hour") @@ -2050,7 +1748,7 @@ def minute(self): '2000-01-01 
00:02:00'], dtype='datetime64[ns]') >>> datetime_index.minute - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("minute") @@ -2071,7 +1769,7 @@ def second(self): '2000-01-01 00:00:02'], dtype='datetime64[ns]') >>> datetime_index.second - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("second") @@ -2092,7 +1790,7 @@ def microsecond(self): '2000-01-01 00:00:00.000002'], dtype='datetime64[ns]') >>> datetime_index.microsecond - Int32Index([0, 1, 2], dtype='int32') + Index([0, 1, 2], dtype='int32') """ # noqa: E501 return as_index( ( @@ -2124,7 +1822,7 @@ def nanosecond(self): '2000-01-01 00:00:00.000000002'], dtype='datetime64[ns]') >>> datetime_index.nanosecond - Int16Index([0, 1, 2], dtype='int16') + Index([0, 1, 2], dtype='int16') """ return self._get_dt_field("nanosecond") @@ -2146,7 +1844,7 @@ def weekday(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.weekday - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2168,7 +1866,7 @@ def dayofweek(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofweek - Int16Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') + Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ return self._get_dt_field("weekday") @@ -2191,7 +1889,7 @@ def dayofyear(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.dayofyear - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2214,7 +1912,7 @@ def day_of_year(self): '2017-01-08'], dtype='datetime64[ns]') >>> datetime_index.day_of_year - Int16Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') + Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ return self._get_dt_field("day_of_year") @@ -2249,7 +1947,7 @@ def quarter(self): Returns 
------- - Int8Index + Index Integer indicating which quarter the date belongs to. Examples @@ -2258,7 +1956,7 @@ def quarter(self): >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", ... "1999-12-31 18:40:00"]) >>> gIndex.quarter - Int8Index([2, 4], dtype='int8') + Index([2, 4], dtype='int8') """ res = extract_quarter(self._values) return Index(res, dtype="int8") @@ -2303,7 +2001,7 @@ def to_pandas(self, nullable=False): def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object - # but we need a NumericalColumn for GenericIndex.. + # but we need a NumericalColumn for Index.. # how should this be handled? out_column = column.build_column( data=out_column.base_data, @@ -2515,7 +2213,7 @@ def tz_convert(self, tz): return DatetimeIndex._from_data({self.name: result_col}) -class TimedeltaIndex(GenericIndex): +class TimedeltaIndex(Index): """ Immutable, ordered and sliceable sequence of timedelta64 data, represented internally as int64. @@ -2588,7 +2286,6 @@ def __init__( if copy: data = data.copy() - super().__init__(data, **kwargs) @_cudf_nvtx_annotate @@ -2605,8 +2302,9 @@ def days(self): """ Number of days for each element. """ + # Need to specifically return `int64` to avoid overflow. 
return as_index( - arbitrary=self._values.days, name=self.name, dtype="int32" + arbitrary=self._values.days, name=self.name, dtype="int64" ) @property # type: ignore @@ -2664,7 +2362,7 @@ def _is_boolean(self): return False -class CategoricalIndex(GenericIndex): +class CategoricalIndex(Index): """ A categorical of orderable values that represent the indices of another Column @@ -2759,7 +2457,6 @@ def __init__( data = data.as_ordered() elif ordered is False and data.ordered is True: data = data.as_unordered() - super().__init__(data, **kwargs) @property # type: ignore @@ -2929,7 +2626,7 @@ def interval_range( return IntervalIndex(interval_col) -class IntervalIndex(GenericIndex): +class IntervalIndex(Index): """ Immutable index of intervals that are closed on the same side. @@ -3043,80 +2740,6 @@ def _is_boolean(self): return False -class StringIndex(GenericIndex): - """String defined indices into another Column - - .. deprecated:: 23.06 - `StringIndex` is deprecated, use `Index` instead. - - Attributes - ---------- - _values: A StringColumn object or NDArray of strings - name: A string - """ - - @_cudf_nvtx_annotate - def __init__(self, values, copy=False, **kwargs): - warnings.warn( - f"cudf.{self.__class__.__name__} is deprecated and will be " - "removed from cudf in a future version. 
Use cudf.Index with the " - "appropriate dtype instead.", - FutureWarning, - ) - kwargs = _setdefault_name(values, **kwargs) - if isinstance(values, StringColumn): - values = values.copy(deep=copy) - elif isinstance(values, StringIndex): - values = values._values.copy(deep=copy) - else: - values = column.as_column(values, dtype="str") - if not is_string_dtype(values.dtype): - raise ValueError( - "Couldn't create StringIndex from passed in object" - ) - - super().__init__(values, **kwargs) - - @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): - return pd.Index( - self.to_numpy(na_value=None), - name=self.name, - dtype=pd.StringDtype() if nullable else "object", - ) - - @_cudf_nvtx_annotate - def __repr__(self): - return ( - f"{self.__class__.__name__}({self._values.values_host}," - f" dtype='object'" - + ( - f", name={pd.io.formats.printing.default_pprint(self.name)}" - if self.name is not None - else "" - ) - + ")" - ) - - @copy_docstring(StringMethods) # type: ignore - @property - @_cudf_nvtx_annotate - def str(self): - return StringMethods(parent=self) - - def _clean_nulls_from_index(self): - if self._values.has_nulls(): - return self.fillna(cudf._NA_REP) - else: - return self - - def _is_boolean(self): - return False - - def _is_object(self): - return True - - @_cudf_nvtx_annotate def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -3137,7 +2760,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: result : subclass of Index - CategoricalIndex for Categorical input. - DatetimeIndex for Datetime input. - - GenericIndex for all other inputs. + - Index for all other inputs. 
""" kwargs = _setdefault_name(arbitrary, **kwargs) if isinstance(arbitrary, cudf.MultiIndex): @@ -3174,119 +2797,12 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: ) -_dtype_to_index: Dict[Any, Type[NumericIndex]] = { - np.int8: Int8Index, - np.int16: Int16Index, - np.int32: Int32Index, - np.int64: Int64Index, - np.uint8: UInt8Index, - np.uint16: UInt16Index, - np.uint32: UInt32Index, - np.uint64: UInt64Index, - np.float32: Float32Index, - np.float64: Float64Index, -} - - def _setdefault_name(values, **kwargs): if kwargs.get("name") is None: kwargs["name"] = getattr(values, "name", None) return kwargs -class IndexMeta(type): - """Custom metaclass for Index that overrides instance/subclass tests.""" - - def __instancecheck__(self, instance): - return isinstance(instance, BaseIndex) - - def __subclasscheck__(self, subclass): - return issubclass(subclass, BaseIndex) - - -class Index(BaseIndex, metaclass=IndexMeta): - """The basic object storing row labels for all cuDF objects. - - Parameters - ---------- - data : array-like (1-dimensional)/ DataFrame - If it is a DataFrame, it will return a MultiIndex - dtype : NumPy dtype (default: object) - If dtype is None, we find the dtype that best fits the data. - copy : bool - Make a copy of input data. - name : object - Name to be stored in the index. - tupleize_cols : bool (default: True) - When True, attempt to create a MultiIndex if possible. - tupleize_cols == False is not yet supported. - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Returns - ------- - Index - cudf Index - - Warnings - -------- - This class should not be subclassed. It is designed as a factory for - different subclasses of :class:`BaseIndex` depending on the provided input. - If you absolutely must, and if you're intimately familiar with the - internals of cuDF, subclass :class:`BaseIndex` instead. 
- - Examples - -------- - >>> import cudf - >>> cudf.Index([1, 2, 3], dtype="uint64", name="a") - UInt64Index([1, 2, 3], dtype='uint64', name='a') - - >>> cudf.Index(cudf.DataFrame({"a":[1, 2], "b":[2, 3]})) - MultiIndex([(1, 2), - (2, 3)], - names=['a', 'b']) - """ - - @_cudf_nvtx_annotate - def __new__( - cls, - data=None, - dtype=None, - copy=False, - name=None, - tupleize_cols=True, - nan_as_null=True, - **kwargs, - ): - assert ( - cls is Index - ), "Index cannot be subclassed, extend BaseIndex instead." - if tupleize_cols is not True: - raise NotImplementedError( - "tupleize_cols != True is not yet supported" - ) - - return as_index( - data, - copy=copy, - dtype=dtype, - name=name, - nan_as_null=nan_as_null, - **kwargs, - ) - - @classmethod - @_cudf_nvtx_annotate - def from_arrow(cls, obj): - try: - return cls(ColumnBase.from_arrow(obj)) - except TypeError: - # Try interpreting object as a MultiIndex before failing. - return cudf.MultiIndex.from_arrow(obj) - - @_cudf_nvtx_annotate def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 40330b45e5b..e406ef14080 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -948,7 +948,7 @@ def _copy_type_metadata( self._index, cudf.core.index.CategoricalIndex ): self._index = cudf.Index( - cast(cudf.core.index.NumericIndex, self._index)._column, + cast("cudf.Index", self._index)._column, name=self._index.name, ) elif isinstance(other._index, cudf.MultiIndex) and not isinstance( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0498aa474b6..cdc120935ee 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1000,11 +1000,11 @@ def _concat(cls, objs): obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) - names = [None] * source_data._num_columns - objs = 
list(filter(lambda o: o.names is not None, objs)) - for o in range(len(objs)): - for i, name in enumerate(objs[o].names): - names[i] = names[i] or name + try: + # Only set names if all objs have the same names + (names,) = {o.names for o in objs} - {None} + except ValueError: + names = [None] * source_data._num_columns return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod @@ -1377,7 +1377,7 @@ def droplevel(self, level=-1): Dropping multiple levels: >>> idx.droplevel(["first", "second"]) - Int64Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') + Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ mi = self.copy(deep=False) mi._poplevels(level) @@ -1779,7 +1779,7 @@ def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common # logic between MultiIndex._union & BaseIndex._union into - # GenericIndex._union. + # Index._union. other_df = other.copy(deep=True).to_frame(index=False) self_df = self.copy(deep=True).to_frame(index=False) col_names = list(range(0, self.nlevels)) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b8164255e6d..d3cd84465ca 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -83,7 +83,7 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None): else: index = indexes[0] if sort is None: - sort = not isinstance(index, cudf.StringIndex) + sort = not index._is_object() for other in indexes[1:]: index = index.union(other, sort=False) @@ -427,7 +427,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.core.index.GenericIndex._concat(objs) + return cudf.core.index.Index._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") diff --git a/python/cudf/cudf/core/series.py 
b/python/cudf/cudf/core/series.py index 34936253bf0..4af8aee171c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1421,9 +1421,7 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = cudf.core.index.GenericIndex._concat( - [o.index for o in objs] - ) + index = cudf.core.index.Index._concat([o.index for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: @@ -3327,7 +3325,7 @@ def keys(self): c 3 dtype: int64 >>> sr.keys() - StringIndex(['a' 'b' 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='object') """ return self.index diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d058d4cee75..27cd1085fa7 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -167,7 +167,7 @@ def from_arrow(cls, array): >>> import cudf >>> import pyarrow as pa >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - StringIndex(['a' 'b' None], dtype='object') + Index(['a' 'b' None], dtype='object') >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) 0 a 1 b @@ -274,7 +274,7 @@ def factorize(self, sort=False, use_na_sentinel=True): >>> codes array([0, 0, 1], dtype=int8) >>> uniques - StringIndex(['a' 'c'], dtype='object') + Index(['a' 'c'], dtype='object') """ return cudf.core.algorithms.factorize( self, diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 484c013f774..0f54391b426 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -65,25 +65,17 @@ def _check_types( if not exact or exact == "equiv": if ( isinstance(left, cudf.RangeIndex) - and isinstance( - right, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(right, cudf.Index) + and hasattr(right, "dtype") + and 
right.dtype.kind == "i" ) ) or ( isinstance(right, cudf.RangeIndex) - and isinstance( - left, - ( - cudf.Int8Index, - cudf.Int16Index, - cudf.Int32Index, - cudf.Int64Index, - ), + and ( + isinstance(left, cudf.Index) + and hasattr(left, "dtype") + and left.dtype.kind == "i" ) ): return @@ -324,7 +316,7 @@ def assert_index_equal( exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted - for Int8Index, Int16Index, Int32Index, Int64Index as well. + for Index with an int8/int32/int64 dtype as well. check_names : bool, default True Whether to check the names attribute. check_less_precise : bool or int, default False diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index e62f19f7877..c74d1fdd85b 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -664,11 +664,11 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) - # Test with a GenericIndex + # Test with a Index pdf2 = pd.DataFrame( {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] ) - # Test with a GenericIndex in a different order + # Test with a Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, index=[0, 3, 5, 3], diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5875959b0c2..e6f2f9ec448 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6518,7 +6518,7 @@ def test_dataframe_info_basic(): str_cmp = textwrap.dedent( """\ - StringIndex: 10 entries, a to 1111 + Index: 10 entries, a to 1111 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- @@ -6591,7 +6591,7 @@ def test_dataframe_info_verbose_mem_usage(): str_cmp 
= textwrap.dedent( """\ - StringIndex: 3 entries, sdfdsf to dsfdf + Index: 3 entries, sdfdsf to dsfdf Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 5583b2290ae..7c610eca88c 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -33,6 +33,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) from cudf.testing.dataset_generator import rand_dataframe @@ -1290,7 +1291,7 @@ def test_groupby_index_type(): df["string_col"] = ["a", "b", "c"] df["counts"] = [1, 2, 3] res = df.groupby(by="string_col").counts.sum() - assert isinstance(res.index, cudf.StringIndex) + assert res.index.dtype == cudf.dtype("object") @pytest.mark.parametrize( @@ -2020,7 +2021,7 @@ def test_groupby_no_keys(pdf): pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 **kwargs, ) @@ -2038,7 +2039,7 @@ def test_groupby_apply_no_keys(pdf): assert_groupby_results_equal( pdf.groupby([], group_keys=False).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), - check_index_type=False, # Int64Index v/s Float64Index + check_index_type=False, # Int64 v/s Float64 **kwargs, ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 0bfd486ae74..de4c72389cf 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -15,7 +15,7 @@ from cudf.core.index import ( CategoricalIndex, DatetimeIndex, - GenericIndex, + Index, IntervalIndex, RangeIndex, as_index, @@ -204,9 +204,9 @@ def test_pandas_as_index(): gdf_category_index = as_index(pdf_category_index) # Check instance types - assert isinstance(gdf_int_index, GenericIndex) - assert isinstance(gdf_uint_index, GenericIndex) - assert 
isinstance(gdf_float_index, GenericIndex) + assert isinstance(gdf_int_index, Index) + assert isinstance(gdf_uint_index, Index) + assert isinstance(gdf_float_index, Index) assert isinstance(gdf_datetime_index, DatetimeIndex) assert isinstance(gdf_category_index, CategoricalIndex) @@ -329,7 +329,7 @@ def test_index_copy_datetime(name, deep=True): @pytest.mark.parametrize("name", ["x"]) def test_index_copy_string(name, deep=True): - cidx = cudf.StringIndex(["a", "b", "c"]) + cidx = cudf.Index(["a", "b", "c"]) pidx = cidx.to_pandas() pidx_copy = pidx.copy(name=name, deep=deep) @@ -393,12 +393,12 @@ def test_index_copy_deep(idx, deep, copy_on_write): original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) if ( - isinstance(idx, cudf.StringIndex) + isinstance(idx._values, cudf.core.column.StringColumn) or not deep or (cudf.get_option("copy_on_write") and not deep) ): # StringColumn is immutable hence, deep copies of a - # StringIndex will share the same StringColumn. + # Index with string dtype will share the same StringColumn. 
# When `copy_on_write` is turned on, Index objects will # have unique column object but they all point to same @@ -1207,91 +1207,48 @@ def test_index_basic(data, dtype, name): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) def test_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Int64Index(data, dtype=dtype, name=name) - # Int8Index - with pytest.warns(FutureWarning): - gindex = cudf.Int8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int8") - - # Int16Index - with pytest.warns(FutureWarning): - gindex = cudf.Int16Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int16") - - # Int32Index - with pytest.warns(FutureWarning): - gindex = cudf.Int32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.Int64Index(data, dtype=dtype, name=name) - # Int64Index - with pytest.warns(FutureWarning): - gindex = cudf.Int64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("int64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", UNSIGNED_TYPES) def test_unsigned_integer_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.UInt64Index(data, dtype=dtype, name=name) - # UInt8Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt8Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint8") - - # UInt16Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt16Index(data, dtype=dtype, name=name) - - 
assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint16") - - # UInt32Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.UInt64Index(data, dtype=dtype, name=name) - # UInt64Index - with pytest.warns(FutureWarning): - gindex = cudf.UInt64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("uint64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", FLOAT_TYPES) def test_float_index_apis(data, name, dtype): - with pytest.warns(FutureWarning): - pindex = pd.Float64Index(data, dtype=dtype, name=name) - # Float32Index - with pytest.warns(FutureWarning): - gindex = cudf.Float32Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float32") + if PANDAS_GE_200: + pindex = pd.Index(data, dtype=dtype, name=name) + else: + with pytest.warns(FutureWarning): + pindex = pd.Float64Index(data, dtype=dtype, name=name) - # Float64Index - with pytest.warns(FutureWarning): - gindex = cudf.Float64Index(data, dtype=dtype, name=name) + gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) - assert gindex.dtype == np.dtype("float64") + assert gindex.dtype == dtype @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @@ -1591,6 +1548,9 @@ def test_interval_index_from_breaks(closed): [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) @pytest.mark.parametrize( @@ -1604,6 +1564,9 @@ def test_interval_index_from_breaks(closed): [[1, 2, 
3, 4], ["yellow", "violet", "pink", "white"]], names=("number1", "color2"), ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), ], ) def test_multiindex_append(data, other): @@ -1726,7 +1689,7 @@ def test_index_fillna(data, fill_value): assert_eq( pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False - ) # Int64Index v/s Float64Index + ) # Int64 v/s Float64 @pytest.mark.parametrize( @@ -1764,7 +1727,7 @@ def test_index_from_arrow(data): arrow_array = pa.Array.from_pandas(pdi) expected_index = pd.Index(arrow_array.to_pandas()) gdi = cudf.Index.from_arrow(arrow_array) - if PANDAS_GE_200: + if PANDAS_GE_200 and gdi.dtype == cudf.dtype("datetime64[s]"): # Arrow bug: # https://github.com/apache/arrow/issues/33321 # arrow cannot convert non-nanosecond diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 1fcf41389dc..db7e4588e95 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -13,7 +13,7 @@ from cudf.core.index import ( CategoricalIndex, DatetimeIndex, - GenericIndex, + Index, RangeIndex, ) from cudf.testing._utils import assert_eq @@ -49,7 +49,7 @@ def test_range_index(testrange): ) def test_generic_index(testlist): - index = GenericIndex(testlist) + index = Index(testlist) index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique @@ -222,7 +222,7 @@ def test_multiindex_tuples(testarr): ) @pytest.mark.parametrize("side", ["left", "right"]) def test_get_slice_bound(testlist, side): - index = GenericIndex(testlist) + index = Index(testlist) index_pd = pd.Index(testlist) for label in testlist: expect = index_pd.get_slice_bound(label, side) @@ -269,7 +269,7 @@ def test_rangeindex_get_slice_bound_step(bounds, label, side): @pytest.mark.parametrize("side", ["left", "right"]) def test_get_slice_bound_missing(label, side): mylist = [2, 4, 6, 8, 10] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = 
pd.Index(mylist) expect = index_pd.get_slice_bound(label, side) @@ -284,7 +284,7 @@ def test_get_slice_bound_missing_str(label, side): # Slicing for monotonic string indices not yet supported # when missing values are specified (allowed in pandas) mylist = ["b", "d", "f"] - index = GenericIndex(mylist) + index = Index(mylist) index_pd = pd.Index(mylist) got = index.get_slice_bound(label, side) expect = index_pd.get_slice_bound(label, side) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index 9b5a8c19cf5..9011efebedb 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -18,7 +18,7 @@ import numpy as np import pandas as pd -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, Index, Series from cudf._lib.copying import pack, unpack from cudf.testing._utils import assert_eq @@ -52,7 +52,7 @@ def check_packed_equality(df): assert_packed_frame_equality(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_equality(sortvaldf) @@ -120,7 +120,7 @@ def check_packed_unique_pointers(df): assert_packed_frame_unique_pointers(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_unique_pointers(sortvaldf) @@ -188,7 +188,7 @@ def check_packed_pickled_equality(df): assert_packed_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) assert_packed_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -261,7 +261,7 @@ def check_packed_serialized_equality(df): assert_packed_frame_serializable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, Index) 
assert_packed_frame_serializable(sortvaldf) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 71c1f206a64..69ccb5be860 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, RangeIndex, Series +from cudf import DataFrame, Index, RangeIndex, Series from cudf.core.buffer import as_buffer from cudf.testing._utils import assert_eq @@ -22,7 +22,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) + assert isinstance(sortvaldf.index, (Index, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band buffers = [] @@ -80,7 +80,7 @@ def test_memory_usage_dataframe(): def test_pickle_index(): nelem = 10 - idx = GenericIndex(np.arange(nelem), name="a") + idx = Index(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) assert (idx == out).all() diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index e7fa401f1ec..7a67fddd87b 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -302,39 +302,40 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): [ ( cudf.Index([1, 2, 3, None]), - "Int64Index([1, 2, 3, ], dtype='int64')", + "Index([1, 2, 3, ], dtype='int64')", ), ( cudf.Index([None, 2.2, 3.324342, None]), - "Float64Index([, 2.2, 3.324342, ], dtype='float64')", + "Index([, 2.2, 3.324342, ], dtype='float64')", ), ( cudf.Index([None, None, None], name="hello"), - "StringIndex([None None None], dtype='object', name='hello')", + "Index([, , ], dtype='object', name='hello')", ), ( cudf.Index([None, None, None], dtype="float", name="hello"), - "Float64Index([, , ], dtype='float64', name='hello')", + "Index([, , ], dtype='float64', name='hello')", ), ( 
cudf.Index([None], dtype="float64", name="hello"), - "Float64Index([], dtype='float64', name='hello')", + "Index([], dtype='float64', name='hello')", ), ( cudf.Index([None], dtype="int8", name="hello"), - "Int8Index([], dtype='int8', name='hello')", + "Index([], dtype='int8', name='hello')", ), ( cudf.Index([None] * 50, dtype="object"), - "StringIndex([None None None None None None None None " - "None None None None None None\n None None None None None None " - "None None None None None None None None\n None None None None " - "None None None None None None None None None None\n None None " - "None None None None None None], dtype='object')", + "Index([, , , , , , , , , " + ", , ,\n , , , , , , , " + ", , , , ,\n , , , , " + ", , , , , , , ,\n , " + ", , , , , , , , , , " + ",\n , ],\n dtype='object')", ), ( cudf.Index([None] * 20, dtype="uint32"), - "UInt32Index([, , , , , , , , " + "Index([, , , , , , , , " ",\n , , , , , , , , " ",\n , ],\n dtype='uint32')", ), @@ -342,7 +343,7 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): cudf.Index( [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" ), - "Int16Index([, 111, 22, 33, , 23, 34, 2343, ], " + "Index([, 111, 22, 33, , 23, 34, 2343, ], " "dtype='int16')", ), ( @@ -482,7 +483,7 @@ def test_dataframe_null_index_repr(df, pandas_special_case): actual_repr = repr(gdf) if pandas_special_case: - # Pandas inconsistently print StringIndex null values + # Pandas inconsistently print Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. 
@@ -561,7 +562,7 @@ def test_series_null_index_repr(sr, pandas_special_case): actual_repr = repr(gsr) if pandas_special_case: - # Pandas inconsistently print StringIndex null values + # Pandas inconsistently print Index null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index e7f26e259c6..2fdc3ef441b 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -193,8 +193,8 @@ def test_serialize_range_index(): def test_serialize_generic_index(): - index = cudf.core.index.GenericIndex(cudf.Series(np.arange(10))) - outindex = cudf.core.index.GenericIndex.deserialize(*index.serialize()) + index = cudf.core.index.Index(cudf.Series(np.arange(10))) + outindex = cudf.core.index.Index.deserialize(*index.serialize()) assert_eq(index, outindex) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 200bd30cb12..618f94ed25b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -17,7 +17,7 @@ from cudf import concat from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.string import StringColumn -from cudf.core.index import StringIndex, as_index +from cudf.core.index import Index, as_index from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1075,8 +1075,7 @@ def test_string_index(): pdf.index = stringIndex gdf.index = stringIndex assert_eq(pdf, gdf) - with pytest.warns(FutureWarning): - stringIndex = StringIndex(["a", "b", "c", "d", "e"], name="name") + stringIndex = Index(["a", "b", "c", "d", "e"], name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 821ec103204..e44775e56df 
100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -57,8 +57,8 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.StringIndex): - return cudf.StringIndex(["cat", "dog"], name=idx.name) + elif isinstance(idx._column, cudf.core.column.StringColumn): + return cudf.Index(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -69,8 +69,8 @@ def _nonempty_index(idx): categories=categories, codes=codes, ordered=ordered ) return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.GenericIndex): - return cudf.core.index.GenericIndex( + elif isinstance(idx, cudf.core.index.Index): + return cudf.core.index.Index( np.arange(2, dtype=idx.dtype), name=idx.name ) elif isinstance(idx, cudf.core.multiindex.MultiIndex): From 72a663ed43c0f95da36eb55c933a9bf564506b6a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 May 2023 10:21:00 -0700 Subject: [PATCH 039/162] Fix MultiIndex.get_indexer pytest --- python/cudf/cudf/tests/test_index.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 58872e00394..80707763246 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2251,10 +2251,15 @@ def test_get_loc_multi_numeric_deviate(idx, key, result): ], ) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric_deviate(idx, key, method): +def test_get_indexer_multi_numeric_deviate(request, idx, key, method): pi = idx gi = cudf.from_pandas(pi) - + request.applymarker( + pytest.mark.xfail( + condition=method is not None and key == ((1, 2, 3),), + 
reason="https://github.com/pandas-dev/pandas/issues/53452", + ) + ) expected = pi.get_indexer(key, method=method) got = gi.get_indexer(key, method=method) From 8791749ec7eabf355ca665695a6122ea5c7f4a05 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 May 2023 13:49:07 -0700 Subject: [PATCH 040/162] Complete get_indexer implementation --- python/cudf/cudf/core/index.py | 114 +++++++++++++++++++++------ python/cudf/cudf/core/multiindex.py | 38 +++++---- python/cudf/cudf/tests/test_index.py | 17 ++-- 3 files changed, 119 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a9db663a19b..a4e716910f1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -16,6 +16,7 @@ ) import cupy +import operator import numpy as np import pandas as pd from pandas._config import get_option @@ -1174,10 +1175,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): """ if is_scalar(target): raise TypeError("Should be a sequence") - # if tolerance is not None: - # raise NotImplementedError( - # "Parameter tolerance is not supported yet." 
- # ) + if method not in { None, "ffill", @@ -1220,28 +1218,27 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) if method is None: result_series = result_series.fillna(-1) + elif method in {"ffill", "bfill", "pad", "backfill"}: + result_series = _get_indexer_basic( + index=self, + positions=result_series, + method=method, + target_col=needle_table["None"], + tolerance=tolerance, + ) + elif method == "nearest": + result_series = _get_nearest_indexer( + index=self, + positions=result_series, + target_col=needle_table["None"], + tolerance=tolerance, + ) else: - nonexact = result_series.isnull() - result_series[nonexact] = self.searchsorted( - needle_table["None"][nonexact], - side="left" if method in {"pad", "ffill"} else "right", + raise ValueError( + f"{method=} is unsupported, only supported values are: " + f"{['ffill', 'bfill', 'nearest', None]}" ) - if method in {"pad", "ffill"}: - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. 
- result_series[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - result_series[result_series == len(self)] = -1 - if tolerance is not None: - distance = self[result_series] - needle_table["None"] - # return cupy.where(distance <= tolerance, result_series, -1) - return result_series.where(distance <= tolerance, -1).to_cupy() + return result_series.to_cupy() @_cudf_nvtx_annotate @@ -2908,3 +2905,72 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: old_s, s = s, old_s - quotient * s old_t, t = t, old_t - quotient * t return old_r, old_s, old_t + + +def _get_indexer_basic(index, positions, method, target_col, tolerance): + nonexact = positions.isnull() + positions[nonexact] = index.searchsorted( + target_col[nonexact], + side="left" if method in {"pad", "ffill"} else "right", + ) + if method in {"pad", "ffill"}: + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + positions[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + positions[positions == len(index)] = -1 + + if tolerance is not None: + distance = abs(index[positions] - target_col) + return positions.where(distance <= tolerance, -1) + return positions + + +def _get_nearest_indexer(index, positions, target_col, tolerance): + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other. 
+ """ + if not len(index): + return _get_indexer_basic( + index=index, + positions=positions.copy(deep=True), + method="pad", + target_col=target_col, + tolerance=tolerance, + ) + + left_indexer = _get_indexer_basic( + index=index, + positions=positions.copy(deep=True), + method="pad", + target_col=target_col, + tolerance=tolerance, + ) + right_indexer = _get_indexer_basic( + index=index, + positions=positions.copy(deep=True), + method="backfill", + target_col=target_col, + tolerance=tolerance, + ) + + left_distances = abs(index[left_indexer] - target_col) + right_distances = abs(index[right_indexer] - target_col) + + op = operator.lt if index.is_monotonic_increasing else operator.le + indexer = left_indexer.where( + op(left_distances, right_distances) | (right_indexer == -1), + right_indexer, + ) + + if tolerance is not None: + distance = abs(index[indexer] - target_col) + return indexer.where(distance <= tolerance, -1) + return indexer diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7b5eed2d631..9f41e9db610 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -27,6 +27,7 @@ _index_astype_docstring, _lexsorted_equal_range, as_index, + _get_indexer_basic, ) from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate @@ -1696,6 +1697,11 @@ def get_indexer(self, target, method=None, tolerance=None): raise NotImplementedError( "Parameter tolerance is not supported yet." ) + if method == "nearest": + raise NotImplementedError( + f"{method=} is not supported yet for MultiIndex." 
+ ) + target = cudf.MultiIndex.from_tuples(target) needle_table = target.to_frame(index=False) col_names = list(range(0, self.nlevels)) @@ -1712,28 +1718,20 @@ def get_indexer(self, target, method=None, tolerance=None): ) if method is None: result_series = result_series.fillna(-1) + elif method in {"ffill", "bfill", "pad", "backfill"}: + result_series = _get_indexer_basic( + index=self, + positions=result_series, + method=method, + target_col=needle_table[col_names], + tolerance=tolerance, + ) else: - nonexact = result_series.isnull() - result_series[nonexact] = self.searchsorted( - needle_table[col_names][nonexact], - side="left" if method in {"pad", "ffill"} else "right", + raise ValueError( + f"{method=} is unsupported, only supported values are: " + f"{['ffill', 'bfill', None]}" ) - if method in {"pad", "ffill"}: - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. 
- result_series[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - result_series[result_series == len(self)] = -1 - if tolerance is not None: - distance = self[result_series] - needle_table["None"] - # return cupy.where(distance <= tolerance, result_series, -1) - return result_series.where(distance <= tolerance, -1).to_cupy() + return result_series.to_cupy() @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 80707763246..2eb14cb2413 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1940,7 +1940,7 @@ def test_get_indexer_single_unique_numeric(idx, key, method): list(range(77, 110, 3)), ], ) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) @pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) def test_get_indexer_rangeindex(idx, key, method, tolerance): pi = idx @@ -2018,8 +2018,9 @@ def test_get_loc_single_duplicate_numeric(idx, key): ], ) @pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_single_duplicate_numeric(idx, key, method): +@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) +@pytest.mark.parametrize("tolerance", [None, 1, 2]) +def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance): pi = idx gi = cudf.from_pandas(pi) @@ -2031,8 +2032,12 @@ def test_get_indexer_single_duplicate_numeric(idx, key, method): rfunc_args_and_kwargs=([], {"key": key, "method": method}), ) else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) + expected = pi.get_indexer( + key, method=method, tolerance=None if method is None else 
tolerance + ) + got = gi.get_indexer( + key, method=method, tolerance=None if method is None else tolerance + ) assert_eq(expected, got) @@ -2179,7 +2184,7 @@ def test_get_loc_multi_numeric(idx, key): ], ) @pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) -@pytest.mark.parametrize("method", [None]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_multi_numeric(idx, key, method): pi = idx.sort_values() gi = cudf.from_pandas(pi) From ac39341437dc12d35e41e00285909829596fbc4c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 May 2023 14:24:09 -0700 Subject: [PATCH 041/162] Update docs --- docs/cudf/source/api_docs/index_objects.rst | 3 + python/cudf/cudf/core/_base_index.py | 82 +++++++++++++- python/cudf/cudf/core/index.py | 77 +------------ python/cudf/cudf/core/multiindex.py | 114 +------------------- 4 files changed, 90 insertions(+), 186 deletions(-) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 1b748a8f69f..69b5a5f0631 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -135,6 +135,7 @@ Selecting .. autosummary:: :toctree: api/ + Index.get_indexer Index.get_level_values Index.get_loc Index.get_slice_bound @@ -191,6 +192,7 @@ IntervalIndex components IntervalIndex.from_breaks IntervalIndex.values + IntervalIndex.get_indexer IntervalIndex.get_loc .. _api.multiindex: @@ -236,6 +238,7 @@ MultiIndex selecting .. 
autosummary:: :toctree: api/ + MultiIndex.get_indexer MultiIndex.get_loc MultiIndex.get_level_values diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1527232f340..d34405032b5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +import warnings from functools import cached_property from typing import Any, Set @@ -10,7 +11,6 @@ from typing_extensions import Self import cudf -import warnings from cudf._lib.copying import _gather_map_is_valid, gather from cudf._lib.stream_compaction import ( apply_boolean_mask, @@ -91,9 +91,89 @@ def values(self): raise NotImplementedError def get_indexer(self, target, method=None, limit=None, tolerance=None): + """ + Compute indexer and mask for new index given the current index. + + The indexer should be then used as an input to ndarray.take to align + the current data to the new index. + + Parameters + ---------- + target : Index + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + - default: exact matches only. + - pad / ffill: find the PREVIOUS index value if no exact match. + - backfill / bfill: use NEXT index value if no exact match. + - nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index + value. + tolerance : int or float, optional + Maximum distance from index value for inexact matches. The value + of the index at the matching location must satisfy the equation + ``abs(index[loc] - target) <= tolerance``. + + Returns + ------- + cupy.ndarray + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. + Missing values in the target are marked by -1. 
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index(['c', 'a', 'b']) + >>> index + Index(['c', 'a', 'b'], dtype='object') + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) + """ raise NotImplementedError def get_loc(self, key): + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + + Returns + ------- + int or slice or boolean mask + - If result is unique, return integer index + - If index is monotonic, loc is returned as a slice object + - Otherwise, a boolean mask is returned + + Examples + -------- + >>> import cudf + >>> unique_index = cudf.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + >>> monotonic_index = cudf.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + >>> non_monotonic_index = cudf.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + >>> numeric_unique_index = cudf.Index([1, 2, 3]) + >>> numeric_unique_index.get_loc(3) + 2 + + **MultiIndex** + + >>> multi_index = cudf.MultiIndex.from_tuples([('a', 'd'), ('b', 'e'), ('b', 'f')]) + >>> multi_index + MultiIndex([('a', 'd'), + ('b', 'e'), + ('b', 'f')], + ) + >>> multi_index.get_loc('b') + slice(1, 3, None) + >>> multi_index.get_loc(('b', 'e')) + 1 + """ # noqa: E501 raise NotImplementedError def __getitem__(self, key): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a4e716910f1..a7424bf0f04 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,6 +2,7 @@ from __future__ import annotations +import operator import pickle import warnings from functools import cached_property @@ -16,7 +17,6 @@ ) import cupy -import operator import numpy as np import pandas as pd from pandas._config import get_option @@ -35,6 +35,7 @@ is_scalar, ) from cudf.core._base_index import BaseIndex, _index_astype_docstring +from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import ( 
CategoricalColumn, ColumnBase, @@ -61,7 +62,6 @@ numeric_normalize_types, ) from cudf.utils.utils import _cudf_nvtx_annotate, search_range -from cudf.core._compat import PANDAS_GE_200 class IndexMeta(type): @@ -591,7 +591,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ) @_cudf_nvtx_annotate - def get_indexer(self, target, method=None, tolerance=None): + def get_indexer(self, target, limit=None, method=None, tolerance=None): if method is None: if self.step > 0: start, stop, step = self.start, self.stop, self.step @@ -612,15 +612,13 @@ def get_indexer(self, target, method=None, tolerance=None): return locs else: return self._as_int_index().get_indexer( - target=target, method=method, tolerance=tolerance + target=target, limit=limit, method=method, tolerance=tolerance ) @_cudf_nvtx_annotate def get_loc(self, key): - # Given an actual integer, if not is_scalar(key): raise TypeError("Should be a sequence") - # Given an actual integer, idx = (key - self._start) / self._step idx_int_upper_bound = (self._stop - self._start) // self._step if idx > idx_int_upper_bound or idx < 0: @@ -1134,45 +1132,6 @@ def astype(self, dtype, copy: bool = True): @_cudf_nvtx_annotate def get_indexer(self, target, method=None, limit=None, tolerance=None): - """Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - target : label - method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - - default: exact matches only. - - pad / ffill: find the PREVIOUS index value if no exact match. - - backfill / bfill: use NEXT index value if no exact match. - - nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index - value. - tolerance : int or float, optional - Maximum distance from index value for inexact matches. The value - of the index at the matching location must satisfy the equation - ``abs(index[loc] - target) <= tolerance``. 
- - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - """ if is_scalar(target): raise TypeError("Should be a sequence") @@ -1243,34 +1202,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): - """Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - key : label - - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - """ if is_scalar(key): target = [key] else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9f41e9db610..f6da845ba46 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -24,10 +24,10 @@ from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, + _get_indexer_basic, _index_astype_docstring, 
_lexsorted_equal_range, as_index, - _get_indexer_basic, ) from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate @@ -1637,62 +1637,7 @@ def _level_index_from_level(self, level): return level @_cudf_nvtx_annotate - def get_indexer(self, target, method=None, tolerance=None): - """ - Get location for a label or a tuple of labels. - - The location is returned as an integer/slice or boolean mask. - - Parameters - ---------- - target : label or tuple of labels (one for each level) - method : None - - Returns - ------- - loc : int, slice object or boolean mask - - If index is unique, search result is unique, return a single int. - - If index is monotonic, index is returned as a slice object. - - Otherwise, cudf attempts a best effort to convert the search - result into a slice object, and will return a boolean mask if - failed to do so. Notice this can deviate from Pandas behavior - in some situations. - - Examples - -------- - >>> import cudf - >>> mi = cudf.MultiIndex.from_tuples( - ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) - >>> mi.get_loc('b') - slice(1, 3, None) - >>> mi.get_loc(('b', 'e')) - 1 - >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) - >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas - slice(1, 4, 2) - - .. pandas-compat:: - **MultiIndex.get_loc** - - The return types of this function may deviates from the - method provided by Pandas. If the index is neither - lexicographically sorted nor unique, a best effort attempt is made - to coerce the found indices into a slice. For example: - - .. code-block:: - - >>> import pandas as pd - >>> import cudf - >>> x = pd.MultiIndex.from_tuples([ - ... (2, 1, 1), (1, 2, 3), (1, 2, 1), - ... (1, 1, 1), (1, 1, 1), (2, 2, 1), - ... 
]) - >>> x.get_loc(1) - array([False, True, True, True, True, False]) - >>> cudf.from_pandas(x).get_loc(1) - slice(1, 5, 1) - """ + def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError( "Parameter tolerance is not supported yet." @@ -1736,61 +1681,6 @@ def get_indexer(self, target, method=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): - """ - Get location for a label or a tuple of labels. - - The location is returned as an integer/slice or boolean mask. - - Parameters - ---------- - key : label or tuple of labels (one for each level) - method : None - - Returns - ------- - loc : int, slice object or boolean mask - - If index is unique, search result is unique, return a single int. - - If index is monotonic, index is returned as a slice object. - - Otherwise, cudf attempts a best effort to convert the search - result into a slice object, and will return a boolean mask if - failed to do so. Notice this can deviate from Pandas behavior - in some situations. - - Examples - -------- - >>> import cudf - >>> mi = cudf.MultiIndex.from_tuples( - ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) - >>> mi.get_loc('b') - slice(1, 3, None) - >>> mi.get_loc(('b', 'e')) - 1 - >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) - >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas - slice(1, 4, 2) - - .. pandas-compat:: - **MultiIndex.get_loc** - - The return types of this function may deviates from the - method provided by Pandas. If the index is neither - lexicographically sorted nor unique, a best effort attempt is made - to coerce the found indices into a slice. For example: - - .. code-block:: - - >>> import pandas as pd - >>> import cudf - >>> x = pd.MultiIndex.from_tuples([ - ... (2, 1, 1), (1, 2, 3), (1, 2, 1), - ... (1, 1, 1), (1, 1, 1), (2, 2, 1), - ... 
]) - >>> x.get_loc(1) - array([False, True, True, True, True, False]) - >>> cudf.from_pandas(x).get_loc(1) - slice(1, 5, 1) - """ is_sorted = ( self.is_monotonic_increasing or self.is_monotonic_decreasing ) From a92ad860d15c7357f12be16d395c50d479cc3c9a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 May 2023 08:56:23 -0500 Subject: [PATCH 042/162] Fix parquet paritioning pytest failures (#13474) This PR fixes parquet pytest failures, mostly working around two upstream issues: 1. https://github.com/pandas-dev/pandas/issues/53345 2. https://github.com/apache/arrow/issues/33321 Thus fixes 20 pytest failure: This PR: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 426.65s (0:07:06) = ``` On `pandas_2.0_feature_branch`: ``` = 251 failed, 95747 passed, 2045 skipped, 764 xfailed, 300 xpassed in 433.50s (0:07:13) = ``` --- python/cudf/cudf/tests/test_parquet.py | 50 ++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index d684e4a19b1..ba49e1fe798 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1749,6 +1749,15 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200 and isinstance(got_pd["c"].dtype, pd.CategoricalDtype): + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["c"] = got_pd["c"].astype( + pd.CategoricalDtype( + categories=got_pd["c"].dtype.categories.astype("int64"), + ordered=got_pd["c"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct @@ -1796,6 +1805,15 @@ def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200: + # Work-around 
for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) @@ -1836,7 +1854,15 @@ def test_parquet_writer_chunked_max_file_size( # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) - + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq( got_pd.sort_values(["b"]).reset_index(drop=True), got_cudf.sort_values(["b"]).reset_index(drop=True), @@ -1882,6 +1908,15 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): # Check that cudf and pd return the same read got_cudf = cudf.read_parquet(gdf_dir) + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + got_pd["a"] = got_pd["a"].astype( + pd.CategoricalDtype( + categories=got_pd["a"].dtype.categories.astype("int64"), + ordered=got_pd["a"].dtype.ordered, + ) + ) assert_eq(got_pd, got_cudf) @@ -1989,6 +2024,15 @@ def test_read_parquet_partitioned_filtered( filters = [[("a", "==", 10)], [("c", "==", 1)]] got = cudf.read_parquet(read_path, filters=filters) expect = pd.read_parquet(read_path, filters=filters) + if PANDAS_GE_200: + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + expect["c"] = expect["c"].astype( + pd.CategoricalDtype( + categories=expect["c"].dtype.categories.astype("int64"), + ordered=expect["c"].dtype.ordered, + ) + ) assert_eq(expect, got) @@ -2803,7 +2847,9 @@ def test_parquet_roundtrip_time_delta(): ) buffer = BytesIO() df.to_parquet(buffer) - assert_eq(df, cudf.read_parquet(buffer)) + # TODO: Remove `check_dtype` once following issue is fixed in 
arrow: + # https://github.com/apache/arrow/issues/33321 + assert_eq(df, cudf.read_parquet(buffer), check_dtype=not PANDAS_GE_200) def test_parquet_reader_malformed_file(datadir): From 63b8fb1a673bbd92761a8b50ec711edd18dc2618 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 2 Jun 2023 19:00:36 -0500 Subject: [PATCH 043/162] Enforce merge validation deprecation (#13499) This PR raises an error when a merge is being performed between data consisting of different levels. --- python/cudf/cudf/core/join/join.py | 12 +++++------- python/cudf/cudf/tests/test_joining.py | 24 +++++++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index de4e323a8d7..480a6c64fe6 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,6 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations -import warnings from typing import Any, ClassVar, List, Optional import cudf @@ -422,11 +421,10 @@ def _validate_merge_params( # modified in the size 0 case. and max(lhs._data.nlevels, 1) != max(rhs._data.nlevels, 1) ): - warnings.warn( - "merging between different levels is deprecated and will be " - f"removed in a future version. ({lhs._data.nlevels} levels on " - f"the left, {rhs._data.nlevels} on the right)", - FutureWarning, + raise ValueError( + "Not allowed to merge between different levels. 
" + f"({lhs._data.nlevels} levels on " + f"the left, {rhs._data.nlevels} on the right)" ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index c578266ac22..f8d0bc2ace8 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2160,15 +2160,21 @@ def test_join_multiindex_empty(): lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}, index=["a", "b", "c"]) lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) rhs = pd.DataFrame(index=["a", "c", "d"]) - with pytest.warns(FutureWarning): - expect = lhs.join(rhs, how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - with pytest.warns(FutureWarning): - got = lhs.join(rhs, how="inner") - - assert_join_results_equal(expect, got, how="inner") + g_lhs = cudf.from_pandas(lhs) + g_rhs = cudf.from_pandas(rhs) + if PANDAS_GE_200: + assert_exceptions_equal( + lfunc=lhs.join, + rfunc=g_lhs.join, + lfunc_args_and_kwargs=([rhs], {"how": "inner"}), + rfunc_args_and_kwargs=([g_rhs], {"how": "inner"}), + check_exception_type=False, + ) + else: + with pytest.warns(FutureWarning): + _ = lhs.join(rhs, how="inner") + with pytest.raises(ValueError): + _ = g_lhs.join(g_rhs, how="inner") def test_join_on_index_with_duplicate_names(): From 139e32d1415c300daff514a5928996b7630ae313 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 2 Jun 2023 19:01:22 -0500 Subject: [PATCH 044/162] Enable `sort=True` for `Index.union`, `Index.difference` and `Index.intersection` (#13497) This PR enables `sort=True` for `union`, `difference`, and `intersection` APIs in `Index`. 
This also fixes 1 pytest failure and adds 77 pytests: On `Index_sort_2.0`: ``` = 230 failed, 95836 passed, 2045 skipped, 768 xfailed, 308 xpassed in 438.88s (0:07:18) = ``` On `pandas_2.0_feature_branch`: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 432.59s (0:07:12) = ``` xref: https://github.com/pandas-dev/pandas/issues/25151 --- python/cudf/cudf/core/_base_index.py | 25 ++++++++------ python/cudf/cudf/core/index.py | 6 ++-- python/cudf/cudf/core/multiindex.py | 4 +-- python/cudf/cudf/tests/test_index.py | 50 +++++++++++++++++++++++----- 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 46e7cdfac61..49721c23eb9 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -334,6 +334,7 @@ def union(self, other, sort=None): 2. `self` or `other` has length 0. * False : do not sort the result. + * True : Sort the result (which may raise TypeError). Returns ------- @@ -395,10 +396,10 @@ def union(self, other, sort=None): if not isinstance(other, BaseIndex): other = cudf.Index(other, name=self.name) - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values of " - f"None or False; {sort} was passed." + f"[None, False, True]; {sort} was passed." ) if not len(other) or self.equals(other): @@ -425,6 +426,7 @@ def intersection(self, other, sort=False): * False : do not sort the result. * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. + * True : Sort the result (which may raise TypeError). 
Returns ------- @@ -475,10 +477,10 @@ def intersection(self, other, sort=False): if not isinstance(other, BaseIndex): other = cudf.Index(other, name=self.name) - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values of " - f"None or False; {sort} was passed." + f"[None, False, True]; {sort} was passed." ) if self.equals(other): @@ -768,6 +770,7 @@ def difference(self, other, sort=None): * None : Attempt to sort the result, but catch any TypeErrors from comparing incomparable elements. * False : Do not sort the result. + * True : Sort the result (which may raise TypeError). Returns ------- @@ -787,16 +790,18 @@ def difference(self, other, sort=None): >>> idx1.difference(idx2, sort=False) Index([2, 1], dtype='int64') """ - if sort not in {None, False}: + if sort not in {None, False, True}: raise ValueError( f"The 'sort' keyword only takes the values " - f"of None or False; {sort} was passed." + f"of [None, False, True]; {sort} was passed." 
) other = cudf.Index(other) - if is_mixed_with_object_dtype(self, other): + if is_mixed_with_object_dtype(self, other) or len(other) == 0: difference = self.copy() + if sort is True: + return difference.sort_values() else: other = other.copy(deep=False) other.names = self.names @@ -813,7 +818,7 @@ def difference(self, other, sort=None): if self.dtype != other.dtype: difference = difference.astype(self.dtype) - if sort is None and len(other): + if sort in {None, True} and len(other): return difference.sort_values() return difference @@ -1170,7 +1175,7 @@ def _union(self, other, sort=None): ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) - if sort is None and len(other): + if sort in {None, True} and len(other): return union_result.sort_values() return union_result @@ -1187,7 +1192,7 @@ def _intersection(self, other, sort=None): ._data ) - if sort is None and len(other): + if sort in {None, True} and len(other): return intersection_result.sort_values() return intersection_result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c0664d3ca4d..a71c285b737 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -699,7 +699,7 @@ def _union(self, other, sort=None): ): result = type(self)(start_r, end_r + step_s / 2, step_s / 2) if result is not None: - if sort is None and not result.is_monotonic_increasing: + if sort in {None, True} and not result.is_monotonic_increasing: return result.sort_values() else: return result @@ -710,7 +710,7 @@ def _union(self, other, sort=None): return self._as_int_index()._union(other, sort=sort) @_cudf_nvtx_annotate - def _intersection(self, other, sort=False): + def _intersection(self, other, sort=None): if not isinstance(other, RangeIndex): return super()._intersection(other, sort=sort) @@ -750,7 +750,7 @@ def _intersection(self, other, sort=False): if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] - if sort is
None: + if sort in {None, True}: new_index = new_index.sort_values() return new_index diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index cdc120935ee..4803e2b8e4b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1796,7 +1796,7 @@ def _union(self, other, sort=None): midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) midx.names = self.names if self.names == other.names else None - if sort is None and len(other): + if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -1819,7 +1819,7 @@ def _intersection(self, other, sort=None): result_df = cudf.merge(self_df, other_df, how="inner") midx = self.__class__.from_frame(result_df, names=res_name) - if sort is None and len(other): + if sort in {None, True} and len(other): return midx.sort_values() return midx diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index de4c72389cf..81369cd2c6e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -781,7 +781,7 @@ def test_index_to_series(data): [], ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) def test_index_difference(data, other, sort): pd_data = pd.Index(data) pd_other = pd.Index(other) @@ -801,8 +801,8 @@ def test_index_difference_sort_error(): assert_exceptions_equal( pdi.difference, gdi.difference, - ([pdi], {"sort": True}), - ([gdi], {"sort": True}), + ([pdi], {"sort": "A"}), + ([gdi], {"sort": "A"}), ) @@ -2236,13 +2236,45 @@ def test_range_index_concat(objs): [ (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), - (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), - (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), - (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), + pytest.param( + pd.RangeIndex(0, 10, 2), + pd.RangeIndex(1, 5, 3), + marks=pytest.mark.xfail( + 
condition=PANDAS_GE_200, + reason="https://github.com/pandas-dev/pandas/issues/53490", + strict=False, + ), + ), + pytest.param( + pd.RangeIndex(1, 5, 3), + pd.RangeIndex(0, 10, 2), + marks=pytest.mark.xfail( + condition=PANDAS_GE_200, + reason="https://github.com/pandas-dev/pandas/issues/53490", + strict=False, + ), + ), + pytest.param( + pd.RangeIndex(1, 10, 3), + pd.RangeIndex(1, 5, 2), + marks=pytest.mark.xfail( + condition=PANDAS_GE_200, + reason="https://github.com/pandas-dev/pandas/issues/53490", + strict=False, + ), + ), (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), - (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), + pytest.param( + pd.RangeIndex(1, 100, 6), + pd.RangeIndex(1, 50, 3), + marks=pytest.mark.xfail( + condition=PANDAS_GE_200, + reason="https://github.com/pandas-dev/pandas/issues/53490", + strict=False, + ), + ), (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), @@ -2250,7 +2282,7 @@ def test_range_index_concat(objs): (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) def test_union_index(idx1, idx2, sort): expected = idx1.union(idx2, sort=sort) @@ -2280,7 +2312,7 @@ def test_union_index(idx1, idx2, sort): (pd.Index([True, False, True, True]), pd.Index([True, True])), ], ) -@pytest.mark.parametrize("sort", [None, False]) +@pytest.mark.parametrize("sort", [None, False, True]) def test_intersection_index(idx1, idx2, sort): expected = idx1.intersection(idx2, sort=sort) From a6869e8a16928b0e07f17d0992eb8b18ea433715 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 2 Jun 2023 19:01:51 -0500 Subject: [PATCH 045/162] Fix a groupby pytest related to numeric_only (#13496) This PR fixes a 
groupby pytest by performing a special version based handling, we will need this handling because of no support for numeric_only in groupby.agg yet. --- python/cudf/cudf/tests/test_groupby.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 7c610eca88c..6d326252d92 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1876,14 +1876,19 @@ def test_groupby_list_columns_excluded(): ) gdf = cudf.from_pandas(pdf) - # cudf does not yet support numeric_only, so our default is False, but - # pandas defaults to inferring and throws a warning about it, so we need to - # catch that. pandas future behavior will match ours by default (at which - # point supporting numeric_only=True will be the open feature request). - with pytest.warns(FutureWarning): - pandas_result = pdf.groupby("a").mean() - with pytest.warns(FutureWarning): - pandas_agg_result = pdf.groupby("a").agg("mean") + if PANDAS_GE_200: + pandas_result = pdf.groupby("a").mean(numeric_only=True) + pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) + else: + # cudf does not yet support numeric_only, so our default is False, but + # pandas defaults to inferring and throws a warning about it, so + # we need to catch that. pandas future behavior will match ours + # by default (at which point supporting numeric_only=True will + # be the open feature request). 
+ with pytest.warns(FutureWarning): + pandas_result = pdf.groupby("a").mean() + with pytest.warns(FutureWarning): + pandas_agg_result = pdf.groupby("a").agg("mean") assert_groupby_results_equal( pandas_result, gdf.groupby("a").mean(), check_dtype=False From 6001bbfce87bc4032f2b15d38a85504b407c007b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 2 Jun 2023 19:02:31 -0500 Subject: [PATCH 046/162] Drop special handling of `min_periods` for `Rolling.count` (#13483) This PR drops a special handling for `Rolling.count` where we always default to `min_periods=0`, this is an inconsistency that pandas-2.0 resolves in: https://github.com/pandas-dev/pandas/pull/48839 This PR fixes 2 pytest failures: ``` = 229 failed, 95769 passed, 2045 skipped, 764 xfailed, 300 xpassed in 458.04s (0:07:38) = ``` On `pandas_2.0_feature_branch`: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 457.58s (0:07:37) = ``` --- python/cudf/cudf/core/window/rolling.py | 2 -- python/cudf/cudf/tests/test_rolling.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8a92ea86d57..f1500408eb4 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -543,8 +543,6 @@ def _window_to_window_sizes(self, window): ) def _apply_agg(self, agg_name): - if agg_name == "count" and not self._time_window: - self.min_periods = 0 index = cudf.MultiIndex.from_frame( cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index b4e0983a9e3..a4c41136b1b 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,14 +8,14 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 from cudf.testing._utils import _create_pandas_series, assert_eq 
from cudf.testing.dataset_generator import rand_dataframe @contextmanager def _hide_pandas_rolling_min_periods_warning(agg): - if agg == "count": + if not PANDAS_GE_200 and agg == "count": with pytest.warns( FutureWarning, match="min_periods=None will default to the size of window " From 4416a24cb8784485c869125f3a674cadb4151db1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 2 Jun 2023 19:02:54 -0500 Subject: [PATCH 047/162] Fix JSON pytests (#13476) This PR fixes 3 json reader pytests: This PR: ``` = 228 failed, 95770 passed, 2045 skipped, 764 xfailed, 300 xpassed in 473.29s (0:07:53) = ``` On `pandas_2.0_feature_branch`: ``` = 231 failed, 95767 passed, 2045 skipped, 764 xfailed, 300 xpassed in 445.90s (0:07:25) = ``` --- python/cudf/cudf/tests/test_json.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 43b0ca0119a..2f062e3a738 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -13,6 +13,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -212,6 +213,18 @@ def test_cudf_json_writer_read(gdf_writer_types): if pdf2.empty: pdf2.reset_index(drop=True, inplace=True) pdf2.columns = pdf2.columns.astype("object") + if PANDAS_GE_200: + # Pandas moved to consistent datetimes parsing format: + # https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#datetimes-are-now-parsed-with-a-consistent-format + for unit in ["s", "ms"]: + if f"col_datetime64[{unit}]" in pdf2.columns: + pdf2[f"col_datetime64[{unit}]"] = ( + pd.to_datetime( + pdf2[f"col_datetime64[{unit}]"], format="mixed" + ) + .dt.tz_localize(None) + .astype(f"datetime64[{unit}]") + ) assert_eq(pdf2, gdf2) From d6324d144fdb02f0e4e762d5070bcb1a543ad0bd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 5 Jun 2023 21:46:36 -0700 Subject: [PATCH 048/162] Fixed strings --- 
python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a7424bf0f04..6d24ccef410 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1195,7 +1195,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): else: raise ValueError( f"{method=} is unsupported, only supported values are: " - f"{['ffill', 'bfill', 'nearest', None]}" + "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}" ) return result_series.to_cupy() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f6da845ba46..e8435bee380 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1674,7 +1674,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): else: raise ValueError( f"{method=} is unsupported, only supported values are: " - f"{['ffill', 'bfill', None]}" + "{['ffill'/'pad', 'bfill'/'backfill', None]}" ) return result_series.to_cupy() From 361e96e55b92d9c08dd7be8783b3071e28b1b9fe Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 6 Jun 2023 09:28:56 -0500 Subject: [PATCH 049/162] Fix `DataFrame.mode` pytest (#13500) This PR xfails a condition that is failing due to a pandas bug: pandas-dev/pandas#53497 --- python/cudf/cudf/tests/test_dataframe.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e6f2f9ec448..4e1e07d2bfd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8394,8 +8394,17 @@ def test_describe_misc_exclude(df, exclude): ) @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) -def test_dataframe_mode(df, numeric_only, dropna): +def 
test_dataframe_mode(request, df, numeric_only, dropna): pdf = df.to_pandas() + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 + and numeric_only is False + and "b" in df.columns + and df["b"].dtype == np.dtype("timedelta64[s]"), + reason="https://github.com/pandas-dev/pandas/issues/53497", + ) + ) expected = pdf.mode(numeric_only=numeric_only, dropna=dropna) actual = df.mode(numeric_only=numeric_only, dropna=dropna) From 8bf7b04a8b5f36df50be90e2606bf1c711d39ee8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 6 Jun 2023 11:47:06 -0700 Subject: [PATCH 050/162] Address first round of reviews --- python/cudf/cudf/core/index.py | 52 ++++++++++++++++------------ python/cudf/cudf/core/multiindex.py | 15 +++++--- python/cudf/cudf/tests/test_index.py | 36 +++++++++++++++++++ 3 files changed, 77 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 83c4c17babe..6e59222fc67 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -592,29 +592,34 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @_cudf_nvtx_annotate def get_indexer(self, target, limit=None, method=None, tolerance=None): - if method is None: - if self.step > 0: - start, stop, step = self.start, self.stop, self.step - else: - # Reversed - reverse = self._range[::-1] - start, stop, step = reverse.start, reverse.stop, reverse.step - - target_array = cupy.asarray(target) - locs = target_array - start - valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) - locs[~valid] = -1 - locs[valid] = locs[valid] / step - - if step != self.step: - # Reversed - locs[valid] = len(self) - 1 - locs[valid] - return locs - else: + target_col = cudf.core.column.as_column(target) + if method is not None or not isinstance( + target_col, cudf.core.column.NumericalColumn + ): + # TODO: See if we can implement this without converting to + # Integer index. 
return self._as_int_index().get_indexer( target=target, limit=limit, method=method, tolerance=tolerance ) + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # Reversed + reverse = self._range[::-1] + start, stop, step = reverse.start, reverse.stop, reverse.step + + target_array = target_col.values + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step + + if step != self.step: + # Reversed + locs[valid] = len(self) - 1 - locs[valid] + return locs + @_cudf_nvtx_annotate def get_loc(self, key): if not is_scalar(key): @@ -1167,9 +1172,12 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): haystack_table = cudf.DataFrame( {"None": self._column, "order": arange(0, len(self))} ) - merged_table = haystack_table.merge( - needle_table, on="None", how="outer" - ) + try: + merged_table = haystack_table.merge( + needle_table, on="None", how="outer" + ) + except ValueError: + return cupy.full(len(needle_table), -1, dtype="int64") result_series = ( merged_table.sort_values(by="order_y") .head(len(target))["order_x"] diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 485644ee2f5..4afd27873e3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1647,15 +1647,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): f"{method=} is not supported yet for MultiIndex." 
) - target = cudf.MultiIndex.from_tuples(target) + try: + target = cudf.MultiIndex.from_tuples(target) + except TypeError: + return cp.full(len(target), -1, dtype="int64") needle_table = target.to_frame(index=False) col_names = list(range(0, self.nlevels)) needle_table["order"] = needle_table.index haystack_table = self.copy(deep=True).to_frame(index=False) haystack_table["order"] = haystack_table.index - merged_table = haystack_table.merge( - needle_table, on=col_names, how="outer" - ) + try: + merged_table = haystack_table.merge( + needle_table, on=col_names, how="outer" + ) + except ValueError: + return cp.full(len(needle_table), -1, dtype="int64") + result_series = ( merged_table.sort_values(by="order_y") .head(len(target))["order_x"] diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index ef674430e1c..db94dadbd2c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2396,6 +2396,42 @@ def test_get_indexer_multi_string(idx, key, method): assert_eq(expected, got) +@pytest.mark.parametrize( + "idx1", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + lambda: cudf.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + ], +) +@pytest.mark.parametrize( + "idx2", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + ], +) +def test_get_indexer_invalid(idx1, idx2): + idx1 = idx1() + idx2 = idx2() + assert_eq( + idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) + ) + + @pytest.mark.parametrize( "objs", [ From 0dc0a3da5e3d33bee5e90a766bb8602a0b56fd13 Mon Sep 17 00:00:00 2001 From: galipremsagar 
Date: Tue, 6 Jun 2023 17:48:20 -0700 Subject: [PATCH 051/162] annotate --- python/cudf/cudf/core/index.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6e59222fc67..e6eb9c8c54b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1172,6 +1172,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): haystack_table = cudf.DataFrame( {"None": self._column, "order": arange(0, len(self))} ) + if not len(self): + return cupy.full(len(needle_table), -1, dtype="int64") try: merged_table = haystack_table.merge( needle_table, on="None", how="outer" @@ -2871,20 +2873,16 @@ def _get_indexer_basic(index, positions, method, target_col, tolerance): return positions -def _get_nearest_indexer(index, positions, target_col, tolerance): +def _get_nearest_indexer( + index: Index, + positions: cudf.Series, + target_col: cudf.core.column.ColumnBase, + tolerance: Union[int, float], +): """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other. 
""" - if not len(index): - return _get_indexer_basic( - index=index, - positions=positions.copy(deep=True), - method="pad", - targe_col=target_col, - tolerance=tolerance, - ) - left_indexer = _get_indexer_basic( index=index, positions=positions.copy(deep=True), From 261f594075d55d2bcb01c8b41f611006fb88c988 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 6 Jun 2023 18:21:36 -0700 Subject: [PATCH 052/162] Fix issues --- python/cudf/cudf/core/index.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e6eb9c8c54b..2e0d09b9d5f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -623,7 +623,7 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): if not is_scalar(key): - raise TypeError("Should be a sequence") + raise TypeError("Should be a scalar-like") idx = (key - self._start) / self._step idx_int_upper_bound = (self._stop - self._start) // self._step if idx > idx_int_upper_bound or idx < 0: @@ -1212,22 +1212,20 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): - if is_scalar(key): - target = [key] - else: - target = key + if not is_scalar(key): + raise TypeError("Should be a scalar-like") is_sorted = ( self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column(target)}) + target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( self, target_as_table, is_sorted ) if lower_bound == upper_bound: - raise KeyError(target) + raise KeyError(key) if lower_bound + 1 == upper_bound: # Search result is unique, return int. 
From dc08ef05a42c8e163351e2ac02860179d9144621 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Jun 2023 09:57:59 -0700 Subject: [PATCH 053/162] Switch to outer inner --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/index.py | 46 +++++++++++++++------------- python/cudf/cudf/core/multiindex.py | 38 +++++++++++++---------- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 0635e970d10..6d20c49cf2e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -126,7 +126,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): >>> index Index(['c', 'a', 'b'], dtype='object') >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1]) + array([ 1, 2, -1], dtype=int32) """ raise NotImplementedError diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2e0d09b9d5f..a251d67e689 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -23,6 +23,7 @@ from typing_extensions import Self import cudf +from cudf import _lib as libcudf from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted @@ -52,6 +53,7 @@ from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame +from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring, doc_apply @@ -1166,43 +1168,43 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "is specified." 
) - needle_table = cudf.DataFrame( - {"None": as_column(target), "order": arange(0, len(target))} + needle = as_column(target) + haystack = self._column + result = cudf.core.column.full( + len(needle), + fill_value=-1 if method is None else None, + dtype=libcudf.types.size_type_dtype, ) - haystack_table = cudf.DataFrame( - {"None": self._column, "order": arange(0, len(self))} - ) - if not len(self): - return cupy.full(len(needle_table), -1, dtype="int64") try: - merged_table = haystack_table.merge( - needle_table, on="None", how="outer" - ) + lcol, rcol = _match_join_keys(needle, haystack, "inner") except ValueError: - return cupy.full(len(needle_table), -1, dtype="int64") - result_series = ( - merged_table.sort_values(by="order_y") - .head(len(target))["order_x"] - .reset_index(drop=True) - ) - if method is None: - result_series = result_series.fillna(-1) - elif method in {"ffill", "bfill", "pad", "backfill"}: + return cupy.full( + len(needle), -1, dtype=libcudf.types.size_type_dtype + ) + scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") + (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) + result_series = cudf.Series(result) + if not len(self): + return cupy.full( + len(needle), -1, dtype=libcudf.types.size_type_dtype + ) + + if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=needle_table["None"], + target_col=cudf.Series(needle), tolerance=tolerance, ) elif method == "nearest": result_series = _get_nearest_indexer( index=self, positions=result_series, - target_col=needle_table["None"], + target_col=cudf.Series(needle), tolerance=tolerance, ) - else: + elif method is not None: raise ValueError( f"{method=} is unsupported, only supported values are: " "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}" diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4afd27873e3..9e79779c6f6 
100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1650,27 +1650,31 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): try: target = cudf.MultiIndex.from_tuples(target) except TypeError: - return cp.full(len(target), -1, dtype="int64") + return cp.full( + len(target), -1, dtype=libcudf.types.size_type_dtype + ) + needle_table = target.to_frame(index=False) col_names = list(range(0, self.nlevels)) - needle_table["order"] = needle_table.index haystack_table = self.copy(deep=True).to_frame(index=False) - haystack_table["order"] = haystack_table.index - try: - merged_table = haystack_table.merge( - needle_table, on=col_names, how="outer" + result = cudf.core.column.full( + len(needle_table), + fill_value=-1 if method is None else None, + dtype=libcudf.types.size_type_dtype, + ) + scatter_map, indices = libcudf.join.join( + list(needle_table._data.columns), + list(haystack_table._data.columns), + how="inner", + ) + (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) + result_series = cudf.Series(result) + if not len(self): + return cp.full( + len(needle_table), -1, dtype=libcudf.types.size_type_dtype ) - except ValueError: - return cp.full(len(needle_table), -1, dtype="int64") - result_series = ( - merged_table.sort_values(by="order_y") - .head(len(target))["order_x"] - .reset_index(drop=True) - ) - if method is None: - result_series = result_series.fillna(-1) - elif method in {"ffill", "bfill", "pad", "backfill"}: + if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, @@ -1678,7 +1682,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target_col=needle_table[col_names], tolerance=tolerance, ) - else: + elif method is not None: raise ValueError( f"{method=} is unsupported, only supported values are: " "{['ffill'/'pad', 'bfill'/'backfill', None]}" From 
4289ef43272772be819fab15af152ce7c38776c1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 13 Jun 2023 12:27:15 -0500 Subject: [PATCH 054/162] Fix `dask_cudf` pytest failures for `pandas-2.0` upgrade (#13548) This PR fixes all `dask_cudf` side failures that happen due to `pandas-2.0` upgrade. The fixes are trivial to be broken down into separate PRs. - [x] `check_less_precise` is removed in `pandas-2.0`, since it is a parameter that we never supported and just had it for the sake of matching signature I removed it from all the methods. - [x] Due to the removal of `StringIndex`, we had to perform some re-ordering of `if/elif` logic in `_nonempty_index`. - [x] `dask_cudf.DataFrame.var` got `numeric_only` support. - [x] `Series.count` doesn't have `skipna` support. Hence removed it from the call. This PR fixes 56 pytest failures: ``` == 1100 passed, 13 skipped, 8 xfailed, 5 xpassed, 114 warnings in 57.10s == ``` On `pandas_2.0_feature_branch`: ``` == 56 failed, 1044 passed, 13 skipped, 8 xfailed, 5 xpassed, 114 warnings in 73.73s (0:01:13) == ``` --- python/cudf/cudf/testing/testing.py | 14 -------------- python/dask_cudf/dask_cudf/backends.py | 12 ++++++------ python/dask_cudf/dask_cudf/core.py | 8 ++++++-- .../dask_cudf/io/tests/test_parquet.py | 5 ++++- .../dask_cudf/tests/test_accessor.py | 6 +++--- python/dask_cudf/dask_cudf/tests/test_core.py | 6 +++--- .../dask_cudf/tests/test_reductions.py | 19 ++++++++++++------- 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 0f54391b426..70a96411a7c 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Union - import cupy as cp import numpy as np import pandas as pd @@ -101,7 +99,6 @@ def assert_column_equal( right, check_dtype=True, check_column_type="equiv", - check_less_precise=False, check_exact=False, 
check_datetimelike_compat=False, check_categorical=True, @@ -129,8 +126,6 @@ def assert_column_equal( Whether to check the columns class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. - check_less_precise : bool or int, default False - Not yet supported check_exact : bool, default False Whether to compare number exactly. check_datetime_like_compat : bool, default False @@ -292,7 +287,6 @@ def assert_index_equal( right, exact="equiv", check_names: bool = True, - check_less_precise: Union[bool, int] = False, check_exact: bool = True, check_categorical: bool = True, check_order: bool = True, @@ -319,8 +313,6 @@ def assert_index_equal( for Index with an int8/int32/int64 dtype as well. check_names : bool, default True Whether to check the names attribute. - check_less_precise : bool or int, default False - Not yet supported check_exact : bool, default False Whether to compare number exactly. check_categorical : bool, default True @@ -404,7 +396,6 @@ def assert_index_equal( exact=check_exact, check_names=check_names, check_exact=check_exact, - check_less_precise=check_less_precise, check_order=check_order, rtol=rtol, atol=atol, @@ -433,7 +424,6 @@ def assert_series_equal( check_dtype=True, check_index_type="equiv", check_series_type=True, - check_less_precise=False, check_names=True, check_exact=False, check_datetimelike_compat=False, @@ -465,8 +455,6 @@ def assert_series_equal( Whether to check the series class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. - check_less_precise : bool or int, default False - Not yet supported check_names : bool, default True Whether to check that the names attribute for both the index and column attributes of the Series is identical. 
@@ -530,7 +518,6 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, @@ -543,7 +530,6 @@ def assert_series_equal( right._column, check_dtype=check_dtype, check_column_type=check_series_type, - check_less_precise=check_less_precise, check_exact=check_exact, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index e44775e56df..3c7c5c99695 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -57,8 +57,6 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx._column, cudf.core.column.StringColumn): - return cudf.Index(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -69,16 +67,18 @@ def _nonempty_index(idx): categories=categories, codes=codes, ordered=ordered ) return cudf.core.index.CategoricalIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.Index): - return cudf.core.index.Index( - np.arange(2, dtype=idx.dtype), name=idx.name - ) elif isinstance(idx, cudf.core.multiindex.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] codes = [[0, 0] for i in idx.levels] return cudf.core.multiindex.MultiIndex( levels=levels, codes=codes, names=idx.names ) + elif isinstance(idx._column, cudf.core.column.StringColumn): + return cudf.Index(["cat", "dog"], name=idx.name) + elif isinstance(idx, cudf.core.index.Index): + return cudf.core.index.Index( + np.arange(2, dtype=idx.dtype), name=idx.name + ) raise TypeError(f"Don't know how to handle index of type {type(idx)}") diff --git 
a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index d2858876fcd..3e5a40f5554 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -269,9 +269,12 @@ def var( dtype=None, out=None, naive=False, + numeric_only=False, ): axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) + meta = self._meta_nonempty.var( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) if axis == 1: result = map_partitions( M.var, @@ -281,6 +284,7 @@ def var( axis=axis, skipna=skipna, ddof=ddof, + numeric_only=numeric_only, ) return handle_out(out, result) elif naive: @@ -421,7 +425,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: - n = x.count(skipna=skipna) + n = x.count() avg = x.mean(skipna=skipna) else: # Not skipping nulls, so might as well diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 8e80aad67d1..489608cef1c 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -13,6 +13,7 @@ from dask.utils import natural_sort_key import cudf +from cudf.core._compat import PANDAS_GE_200 import dask_cudf @@ -173,7 +174,9 @@ def test_dask_timeseries_from_pandas(tmpdir): pdf = ddf2.compute() pdf.to_parquet(fn, engine="pyarrow") read_df = dask_cudf.read_parquet(fn) - dd.assert_eq(ddf2, read_df.compute()) + # Workaround until following issue is fixed: + # https://github.com/apache/arrow/issues/33321 + dd.assert_eq(ddf2, read_df.compute(), check_index_type=not PANDAS_GE_200) @pytest.mark.parametrize("index", [False, None]) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 6b1627c91e8..09d02893c26 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ 
b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -53,8 +53,8 @@ def test_dt_series(data, field): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) base = getattr(pdsr.dt, field) - test = getattr(dsr.dt, field).compute().to_pandas().astype("int64") - assert_series_equal(base, test) + test = getattr(dsr.dt, field).compute() + assert_eq(base, test, check_dtype=False) @pytest.mark.parametrize("data", [data_dt_1()]) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7f8876c8564..afd1d91e29c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -809,7 +809,7 @@ def test_series_describe(): dd.assert_eq( dsr.describe(), pdsr.describe(), - check_less_precise=3, + rtol=1e-3, ) @@ -838,7 +838,7 @@ def test_zero_std_describe(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_less_precise=3) + dd.assert_eq(ddf.describe(), pddf.describe(), rtol=1e-3) def test_large_numbers_var(): @@ -853,7 +853,7 @@ def test_large_numbers_var(): ddf = dgd.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.var(), pddf.var(), check_less_precise=3) + dd.assert_eq(ddf.var(), pddf.var(), rtol=1e-3) def test_index_map_partitions(): diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index c34fbc3b0e7..ae1bfa02357 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -71,10 +71,15 @@ def test_rowwise_reductions(data, op): pddf = gddf.to_dask_dataframe() if op in ("var", "std"): - expected = getattr(pddf, op)(axis=1, ddof=0) - got = getattr(gddf, op)(axis=1, ddof=0) + expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0) + got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0) else: - expected = getattr(pddf, op)(axis=1) - got = getattr(pddf, op)(axis=1) - - dd.assert_eq(expected.compute(), got.compute(), check_exact=False) + expected = getattr(pddf, op)(numeric_only=True, axis=1) + got = getattr(pddf, op)(numeric_only=True, axis=1) + + dd.assert_eq( + expected, + got, + check_exact=False, + check_dtype=op not in ("var", "std"), + ) From e7eb1d3918779bdf110db2f6cea9251d74131664 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 13 Jun 2023 12:48:10 -0700 Subject: [PATCH 055/162] simplify --- python/cudf/cudf/core/index.py | 19 ++++++------- python/cudf/cudf/core/multiindex.py | 43 +++++++++++++++++------------ 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a251d67e689..ab551a43bae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1169,25 +1169,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) needle = as_column(target) - haystack = self._column result = cudf.core.column.full( len(needle), - fill_value=-1 if method is None else None, + fill_value=-1, dtype=libcudf.types.size_type_dtype, ) + + if not len(self): + return result.values try: - lcol, rcol = _match_join_keys(needle, haystack, "inner") + lcol, rcol = _match_join_keys(needle, self._column, "inner") except ValueError: - return cupy.full( - len(needle), -1, dtype=libcudf.types.size_type_dtype - ) + return result.values + scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) 
result_series = cudf.Series(result) - if not len(self): - return cupy.full( - len(needle), -1, dtype=libcudf.types.size_type_dtype - ) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( @@ -2849,7 +2846,7 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: def _get_indexer_basic(index, positions, method, target_col, tolerance): - nonexact = positions.isnull() + nonexact = positions == -1 positions[nonexact] = index.searchsorted( target_col[nonexact], side="left" if method in {"pad", "ffill"} else "right", diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9e79779c6f6..b8390e4e678 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -29,6 +29,7 @@ _lexsorted_equal_range, as_index, ) +from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate @@ -1647,39 +1648,45 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): f"{method=} is not supported yet for MultiIndex." 
) + result = cudf.core.column.full( + len(target), + fill_value=-1, + dtype=libcudf.types.size_type_dtype, + ) + if not len(self): + return result.values try: target = cudf.MultiIndex.from_tuples(target) except TypeError: - return cp.full( - len(target), -1, dtype=libcudf.types.size_type_dtype - ) - - needle_table = target.to_frame(index=False) - col_names = list(range(0, self.nlevels)) - haystack_table = self.copy(deep=True).to_frame(index=False) - result = cudf.core.column.full( - len(needle_table), - fill_value=-1 if method is None else None, - dtype=libcudf.types.size_type_dtype, + return result.values + + lcols, rcols = map( + list, + zip( + *[ + _match_join_keys(lcol, rcol, "inner") + for lcol, rcol in zip( + target._data.columns, self._data.columns + ) + ] + ), ) scatter_map, indices = libcudf.join.join( - list(needle_table._data.columns), - list(haystack_table._data.columns), + lcols, + rcols, how="inner", ) (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) result_series = cudf.Series(result) - if not len(self): - return cp.full( - len(needle_table), -1, dtype=libcudf.types.size_type_dtype - ) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( index=self, positions=result_series, method=method, - target_col=needle_table[col_names], + target_col=target.to_frame(index=False)[ + list(range(0, self.nlevels)) + ], tolerance=tolerance, ) elif method is not None: From fb99b0afe0c2c77799324d5de8601138501769d0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Jun 2023 09:24:16 -0500 Subject: [PATCH 056/162] Enable writing column names with mixed dtype in parquet writer when `mode.pandas_compatible=True` (#13505) This PR enables writing a dataframe that has column names that are of mixed types to a parquet file when pandas compatibility mode is enabled(`mode.pandas_compatible=True`). 
--------- Co-authored-by: Bradley Dice --- .../source/user_guide/pandas-comparison.md | 21 ++++++++++++++ python/cudf/cudf/_lib/parquet.pyx | 9 ++++-- python/cudf/cudf/_lib/utils.pyx | 2 +- python/cudf/cudf/tests/test_parquet.py | 29 ++++++++++++++----- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index ba04a231f41..441bc72205a 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -139,6 +139,27 @@ module, which allow you to compare values up to a desired precision. Unlike Pandas, cuDF does not support duplicate column names. It is best to use unique strings for column names. +## Writing a DataFrame to Parquet with non-string column names + +When there is a DataFrame with non-string column names, pandas casts each +column name to `str` before writing to a Parquet file. `cudf` raises an +error by default if this is attempted. However, to achieve similar behavior +as pandas you can enable the `mode.pandas_compatible` option, which will +enable `cudf` to cast the column names to `str` just like pandas. + +```python +>>> import cudf +>>> df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) +>>> df.to_parquet("df.parquet") + +Traceback (most recent call last): +ValueError: Writing a Parquet file requires string column names +>>> cudf.set_option("mode.pandas_compatible", True) +>>> df.to_parquet("df.parquet") + +UserWarning: The DataFrame has column names of non-string type. They will be converted to strings on write. 
+``` + ## No true `"object"` data type In Pandas and NumPy, the `"object"` data type is used for diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 923f5c4089f..5519bbd4cd5 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -361,9 +361,12 @@ def write_parquet( for i, name in enumerate(table._column_names, num_index_cols_meta): if not isinstance(name, str): - raise ValueError("parquet must have string column names") - - tbl_meta.get().column_metadata[i].set_name(name.encode()) + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.get().column_metadata[i].set_name(str(name).encode()) + else: + raise ValueError("Writing a Parquet file requires string column names") + else: + tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i], diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 56918799cca..f5a5571a72f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -174,7 +174,7 @@ cpdef generate_pandas_metadata(table, index): for col in table._columns ], df=table, - column_names=col_names, + column_names=map(str, col_names), index_levels=index_levels, index_descriptors=index_descriptors, preserve_index=index, diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index cd7075e1851..74ed6baead6 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -30,7 +30,6 @@ from cudf.testing._utils import ( TIMEDELTA_TYPES, assert_eq, - assert_exceptions_equal, expect_warning_if, set_random_null_mask_inplace, ) @@ -2528,15 +2527,29 @@ def test_parquet_writer_decimal(decimal_type, data): def test_parquet_writer_column_validation(): - df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) + df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) pdf = df.to_pandas() - 
assert_exceptions_equal( - lfunc=df.to_parquet, - rfunc=pdf.to_parquet, - lfunc_args_and_kwargs=(["cudf.parquet"],), - rfunc_args_and_kwargs=(["pandas.parquet"],), - ) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.warns(UserWarning): + df.to_parquet("cudf.parquet") + + if PANDAS_GE_200: + with pytest.warns(UserWarning): + pdf.to_parquet("pandas.parquet") + + assert_eq( + pd.read_parquet("cudf.parquet"), + cudf.read_parquet("pandas.parquet"), + ) + assert_eq( + cudf.read_parquet("cudf.parquet"), + pd.read_parquet("pandas.parquet"), + ) + + with cudf.option_context("mode.pandas_compatible", False): + with pytest.raises(ValueError): + df.to_parquet("cudf.parquet") def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): From 2488d9143a5d38765c2294ebc40f1f212a77f2c6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 20 Jun 2023 15:47:16 -0700 Subject: [PATCH 057/162] address reviews --- python/cudf/cudf/core/index.py | 3 +++ python/cudf/cudf/core/multiindex.py | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ab551a43bae..ccde34c2654 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2846,6 +2846,9 @@ def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: def _get_indexer_basic(index, positions, method, target_col, tolerance): + # `positions` will be modified in-place, so it is the + # responsibility of the caller to decide whether or not + # to make a copy of it before passing it to this method. 
nonexact = positions == -1 positions[nonexact] = index.searchsorted( target_col[nonexact], diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b8390e4e678..5bb379b94c3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1660,20 +1660,18 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): except TypeError: return result.values - lcols, rcols = map( - list, - zip( - *[ - _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip( - target._data.columns, self._data.columns + scatter_map, indices = libcudf.join.join( + *map( + list, + zip( + *( + _match_join_keys(lcol, rcol, "inner") + for lcol, rcol in zip( + target._data.columns, self._data.columns + ) ) - ] + ), ), - ) - scatter_map, indices = libcudf.join.join( - lcols, - rcols, how="inner", ) (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) From 7f216cf9f7ef463d47aeed248a9588e4b258ace3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 20 Jun 2023 15:49:03 -0700 Subject: [PATCH 058/162] fix --- python/cudf/cudf/core/multiindex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5bb379b94c3..10929dbb804 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1664,12 +1664,12 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): *map( list, zip( - *( + *[ _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip( target._data.columns, self._data.columns ) - ) + ] ), ), how="inner", From 13d62c5a2fb6b1325627ba1ce6b6e946ed92e85a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 20 Jun 2023 15:55:55 -0700 Subject: [PATCH 059/162] simplify --- python/cudf/cudf/core/multiindex.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py 
b/python/cudf/cudf/core/multiindex.py index 10929dbb804..649f6d15c76 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1660,18 +1660,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): except TypeError: return result.values + join_keys = [ + _match_join_keys(lcol, rcol, "inner") + for lcol, rcol in zip(target._data.columns, self._data.columns) + ] + join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( - *map( - list, - zip( - *[ - _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip( - target._data.columns, self._data.columns - ) - ] - ), - ), + *join_keys, how="inner", ) (result,) = libcudf.copying.scatter([indices], scatter_map, [result]) From 273945b831fc5cb6847677e3391886336d788fb8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 31 Jul 2023 10:50:11 -0700 Subject: [PATCH 060/162] Fix default behavior of index metaclass instance and subclass checks #13787 The current implementation of IndexMeta gives the wrong result for subclasses of Index: for instance, DatetimeIndex.__instancecheck__(DatetimeIndex(), DatetimeIndex) or DatetimeIndex.__subclasscheck__(DatetimeIndex, DatetimeIndex) would return False. In the case of isinstance, however, we have been saved by https://bugs.python.org/issue35083, wherein Python is silently injecting a if type(instance) is self: return True short-circuit. issubclass currently does have the wrong behavior, though. The fix is to fall back to the original behavior of isinstance/issubclass rather than hardcoding a boolean result. That will also ensure that we get the correct behavior if e.g. a subclass of Index is itself subclassed, e.g. if we introduced a class SpecialDatetimeIndex(DatetimeIndex). 
--- python/cudf/cudf/core/index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7235ddc5e50..297ac21fb7e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -87,13 +87,13 @@ def __instancecheck__(self, instance): if self is cudf.Index: return isinstance(instance, BaseIndex) else: - return False + return type.__instancecheck__(self, instance) def __subclasscheck__(self, subclass): if self is cudf.Index: return issubclass(subclass, BaseIndex) else: - return False + return type.__subclasscheck__(self, subclass) def _lexsorted_equal_range( From db92536c86ea215cec705e80c59344f2d64de709 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Oct 2023 14:16:53 -0700 Subject: [PATCH 061/162] merge fix --- python/cudf/cudf/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 591816e4161..06c67e831a6 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -91,10 +91,10 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): if use_na_sentinel: na_sentinel = Scalar(-1) - cats = values._column.dropna() + cats = values.dropna() else: na_sentinel = Scalar(None, dtype=values.dtype) - cats = values._column + cats = values cats = cats.unique().astype(values.dtype) From fc6a30f514633176d860cb1eeaf7308a66cdecae Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Oct 2023 17:41:48 -0700 Subject: [PATCH 062/162] Handle PandasArray renaming --- python/cudf/cudf/core/column/column.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ed23c9574b0..3289f99d237 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -67,7 +67,7 @@ 
is_string_dtype, is_struct_dtype, ) -from cudf.core._compat import PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_210 from cudf.core.abc import Serializable from cudf.core.buffer import ( Buffer, @@ -101,6 +101,11 @@ else: from pandas.core.arrays._arrow_utils import ArrowIntervalType +if PANDAS_GE_210: + NumpyExtensionArray = pd.arrays.NumpyExtensionArray +else: + NumpyExtensionArray = pd.arrays.PandasArray + class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _VALID_REDUCTIONS = { @@ -2213,7 +2218,7 @@ def as_column( if delayed_cast: data = data.astype(cudf.dtype(dtype)) - elif isinstance(arbitrary, pd.arrays.PandasArray): + elif isinstance(arbitrary, NumpyExtensionArray): if is_categorical_dtype(arbitrary.dtype): arb_dtype = arbitrary.dtype else: From ad3ae65d7bdaa67269e00e48c6eba36c3227bac8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 12 Oct 2023 17:51:53 -0500 Subject: [PATCH 063/162] Deprecate `is_categorical_dtype` (#14274) This PR deprecates `is_categorical_dtype` to match https://github.com/pandas-dev/pandas/pull/52527 which was introduced in `pandas-2.x`. This PR internalizes the public API since this is a needed utility in our code base. 
This PR: ``` = 23835 failed, 5698 passed, 1613 skipped, 288 xfailed, 423 errors in 1976.84s (0:32:56) = ``` On `pandas_2.0_feature_branch`: ``` = 24297 failed, 5115 passed, 1613 skipped, 288 xfailed, 480 errors in 1980.46s (0:33:00) = ``` --- python/cudf/cudf/_fuzz_testing/csv.py | 4 +- python/cudf/cudf/_fuzz_testing/json.py | 4 +- python/cudf/cudf/_lib/column.pyx | 10 ++--- python/cudf/cudf/_lib/csv.pyx | 8 ++-- python/cudf/cudf/_lib/groupby.pyx | 6 +-- python/cudf/cudf/_lib/json.pyx | 4 +- python/cudf/cudf/_lib/utils.pyx | 6 +-- python/cudf/cudf/api/types.py | 5 ++- python/cudf/cudf/core/_internals/where.py | 4 +- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 24 ++++++----- python/cudf/cudf/core/column/interval.py | 4 +- python/cudf/cudf/core/dataframe.py | 22 +++++----- python/cudf/cudf/core/dtypes.py | 46 ++++++++++++++------- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexed_frame.py | 16 +++---- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 6 +-- python/cudf/cudf/testing/testing.py | 16 +++---- python/cudf/cudf/tests/test_api_types.py | 9 ++-- python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/tests/test_concat.py | 12 +++--- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/utils/dtypes.py | 12 +++--- python/dask_cudf/dask_cudf/backends.py | 4 +- python/dask_cudf/dask_cudf/sorting.py | 4 +- 27 files changed, 134 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 8ab7048cff0..13ea07372d0 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import logging import random @@ -99,7 +99,7 @@ def set_rand_params(self, params): if dtype_val is not None: dtype_val = { col_name: "category" - if cudf.utils.dtypes.is_categorical_dtype(dtype) + if cudf.utils.dtypes._is_categorical_dtype(dtype) else pandas_dtypes_to_np_dtypes[dtype] for col_name, dtype in dtype_val.items() } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 29e0aeb7050..c6e74798cd7 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import logging import random @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val): if dtype_val is not None and isinstance(dtype_val, abc.Mapping): processed_dtypes = {} for col_name, dtype in dtype_val.items(): - if cudf.utils.dtypes.is_categorical_dtype(dtype): + if cudf.utils.dtypes._is_categorical_dtype(dtype): processed_dtypes[col_name] = "category" else: processed_dtypes[col_name] = str( diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 4db3761b1b8..fbd70de9905 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,7 +11,7 @@ import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype +from cudf.api.types import _is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -331,7 +331,7 @@ cdef class Column: ) cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): + if _is_categorical_dtype(self.dtype): col = self.base_children[0] data_dtype = col.dtype elif is_datetime64tz_dtype(self.dtype): @@ -394,7 +394,7 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if 
is_categorical_dtype(self.dtype): + if _is_categorical_dtype(self.dtype): col = self.base_children[0] data_dtype = col.dtype elif is_datetime64tz_dtype(self.dtype): @@ -469,7 +469,7 @@ cdef class Column: # categoricals because cudf supports ordered and unordered categoricals # while libcudf supports only unordered categoricals (see # https://github.com/rapidsai/cudf/pull/8567). - if is_categorical_dtype(self.dtype): + if _is_categorical_dtype(self.dtype): col = self.base_children[0] else: col = self @@ -635,7 +635,7 @@ cdef class Column: """ column_owner = isinstance(owner, Column) mask_owner = owner - if column_owner and is_categorical_dtype(owner.dtype): + if column_owner and _is_categorical_dtype(owner.dtype): owner = owner.base_children[0] size = cv.size() diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 630dcf73545..399a53c09b5 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -434,7 +434,7 @@ def read_csv( if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - if cudf.api.types.is_categorical_dtype(v): + if cudf.api.types._is_categorical_dtype(v): df._data[str(k)] = df._data[str(k)].astype(v) elif ( cudf.api.types.is_scalar(dtype) or @@ -442,11 +442,11 @@ def read_csv( np.dtype, pd.api.extensions.ExtensionDtype, type )) ): - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): df = df.astype(dtype) elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): - if cudf.api.types.is_categorical_dtype(col_dtype): + if cudf.api.types._is_categorical_dtype(col_dtype): col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) @@ -547,7 +547,7 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 - if 
cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): if isinstance(dtype, str): dtype = "str" else: diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index b9447c96ee6..0067981169c 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -3,7 +3,7 @@ from pandas.core.groupby.groupby import DataError from cudf.api.types import ( - is_categorical_dtype, + _is_categorical_dtype, is_decimal_dtype, is_interval_dtype, is_list_dtype, @@ -189,7 +189,7 @@ cdef class GroupBy: valid_aggregations = ( _LIST_AGGS if is_list_dtype(dtype) else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _CATEGORICAL_AGGS if _is_categorical_dtype(dtype) else _STRUCT_AGGS if is_struct_dtype(dtype) else _INTERVAL_AGGS if is_interval_dtype(dtype) else _DECIMAL_AGGS if is_decimal_dtype(dtype) @@ -260,7 +260,7 @@ cdef class GroupBy: valid_aggregations = ( _LIST_AGGS if is_list_dtype(dtype) else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _CATEGORICAL_AGGS if _is_categorical_dtype(dtype) else _STRUCT_AGGS if is_struct_dtype(dtype) else _INTERVAL_AGGS if is_interval_dtype(dtype) else _DECIMAL_AGGS if is_decimal_dtype(dtype) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..f66109bccbd 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -214,7 +214,7 @@ def write_json( cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" @@ -237,7 +237,7 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef data_type 
_get_cudf_data_type_from_dtype(object dtype) except *: - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 2259d90468f..69b0fe5d8f2 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -23,7 +23,7 @@ except ImportError: import json from cudf.api.types import ( - is_categorical_dtype, + _is_categorical_dtype, is_decimal_dtype, is_list_dtype, is_struct_dtype, @@ -92,7 +92,7 @@ cpdef generate_pandas_metadata(table, index): # Columns for name, col in table._data.items(): col_names.append(name) - if is_categorical_dtype(col): + if _is_categorical_dtype(col): raise ValueError( "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" @@ -147,7 +147,7 @@ cpdef generate_pandas_metadata(table, index): level=level, column_names=col_names ) - if is_categorical_dtype(idx): + if _is_categorical_dtype(idx): raise ValueError( "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 7ed5bc31420..007b9f3ee02 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -18,7 +18,7 @@ from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, dtype, - is_categorical_dtype, + _is_categorical_dtype, is_decimal32_dtype, is_decimal64_dtype, is_decimal128_dtype, @@ -112,7 +112,7 @@ def is_string_dtype(obj): or ( pd.api.types.is_string_dtype(obj) # Reject all cudf extension types. 
- and not is_categorical_dtype(obj) + and not _is_categorical_dtype(obj) and not is_decimal_dtype(obj) and not is_list_dtype(obj) and not is_struct_dtype(obj) @@ -486,6 +486,7 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: is_iterator = pd_types.is_iterator is_bool = pd_types.is_bool is_categorical = pd_types.is_categorical_dtype +# TODO is_complex = pd_types.is_complex is_float = pd_types.is_float is_hashable = pd_types.is_hashable diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0f65861dc72..ce22d4c8860 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -8,8 +8,8 @@ import cudf from cudf._typing import ScalarLike from cudf.api.types import ( + _is_categorical_dtype, _is_non_decimal_numeric_dtype, - is_categorical_dtype, is_scalar, ) from cudf.core.column import ColumnBase @@ -45,7 +45,7 @@ def _check_and_cast_columns_with_other( ) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: # Returns type-casted `source_col` & `other` based on `inplace`. 
source_dtype = source_col.dtype - if is_categorical_dtype(source_dtype): + if _is_categorical_dtype(source_dtype): return _normalize_categorical(source_col, other) other_is_scalar = is_scalar(other) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 40abd5b7db8..af0e3257d4e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -16,7 +16,7 @@ from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.api.types import is_categorical_dtype, is_interval_dtype +from cudf.api.types import _is_categorical_dtype, is_interval_dtype from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -98,7 +98,7 @@ class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn def __init__(self, parent: SeriesOrSingleColumnIndex): - if not is_categorical_dtype(parent.dtype): + if not _is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3289f99d237..048ce620a8d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -49,10 +49,10 @@ from cudf._lib.types import size_type_dtype from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( + _is_categorical_dtype, _is_non_decimal_numeric_dtype, infer_dtype, is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_decimal32_dtype, @@ -977,7 +977,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if self.dtype == dtype: return self - if is_categorical_dtype(dtype): + if _is_categorical_dtype(dtype): return self.as_categorical_column(dtype, 
**kwargs) dtype = ( @@ -987,7 +987,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: ) if _is_non_decimal_numeric_dtype(dtype): return self.as_numerical_column(dtype, **kwargs) - elif is_categorical_dtype(dtype): + elif _is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif cudf.dtype(dtype).type in { np.str_, @@ -1423,7 +1423,7 @@ def column_empty_like( if ( hasattr(column, "dtype") - and is_categorical_dtype(column.dtype) + and _is_categorical_dtype(column.dtype) and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) @@ -1476,7 +1476,7 @@ def column_empty( full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), column_empty(row_count, dtype=dtype.element_type), ) - elif is_categorical_dtype(dtype): + elif _is_categorical_dtype(dtype): data = None children = ( build_column( @@ -1553,7 +1553,7 @@ def build_column( offset=offset, null_count=null_count, ) - if is_categorical_dtype(dtype): + if _is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( "Must specify exactly one child column for CategoricalColumn" @@ -2037,7 +2037,7 @@ def as_column( f"{arbitrary.dtype} is not supported. Convert first to " f"{arbitrary.dtype.subtype}." 
) - if is_categorical_dtype(arbitrary.dtype): + if _is_categorical_dtype(arbitrary.dtype): if isinstance( arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype ): @@ -2219,7 +2219,7 @@ def as_column( data = data.astype(cudf.dtype(dtype)) elif isinstance(arbitrary, NumpyExtensionArray): - if is_categorical_dtype(arbitrary.dtype): + if _is_categorical_dtype(arbitrary.dtype): arb_dtype = arbitrary.dtype else: if arbitrary.dtype == pd.StringDtype(): @@ -2347,7 +2347,9 @@ def as_column( np_type = None try: if dtype is not None: - if is_categorical_dtype(dtype) or is_interval_dtype(dtype): + if _is_categorical_dtype(dtype) or is_interval_dtype( + dtype + ): raise TypeError if is_datetime64tz_dtype(dtype): raise NotImplementedError( @@ -2491,7 +2493,7 @@ def as_column( except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e: if isinstance(e, MixedTypeError): raise TypeError(str(e)) - if is_categorical_dtype(dtype): + if _is_categorical_dtype(dtype): sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif np_type == np.str_: @@ -2774,7 +2776,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # ColumnBase._concat so that all subclasses can override necessary # behavior. However, at the moment it's not clear what that API should look # like, so CategoricalColumn simply implements a minimal working API. 
- if all(is_categorical_dtype(o.dtype) for o in objs): + if all(_is_categorical_dtype(o.dtype) for o in objs): return cudf.core.column.categorical.CategoricalColumn._concat( cast( MutableSequence[ diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 38384d09126..7b87552f1a0 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -5,7 +5,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_categorical_dtype, is_interval_dtype +from cudf.api.types import _is_categorical_dtype, is_interval_dtype from cudf.core.column import StructColumn from cudf.core.dtypes import IntervalDtype @@ -102,7 +102,7 @@ def copy(self, deep=True): def as_interval_column(self, dtype, **kwargs): if is_interval_dtype(dtype): - if is_categorical_dtype(self): + if _is_categorical_dtype(self): new_struct = self._get_decategorized_column() return IntervalColumn.from_struct_column(new_struct) if is_interval_dtype(dtype): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 23c2405b58e..0cfa37c224d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -45,7 +45,7 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, - is_categorical_dtype, + _is_categorical_dtype, is_datetime_dtype, is_dict_like, is_dtype_equal, @@ -1403,7 +1403,7 @@ def _get_numeric_data(self): columns = [ c for c, dt in self.dtypes.items() - if dt != object and not is_categorical_dtype(dt) + if dt != object and not _is_categorical_dtype(dt) ] return self[columns] @@ -1646,9 +1646,9 @@ def _concat( out._index._data, indices[:first_data_column_position], ) - if not isinstance(out._index, MultiIndex) and is_categorical_dtype( - out._index._values.dtype - ): + if not isinstance( + out._index, MultiIndex + ) and _is_categorical_dtype(out._index._values.dtype): out = out.set_index( cudf.core.index.as_index(out.index._values) ) @@ 
-3807,8 +3807,8 @@ def transpose(self): # No column from index is transposed with libcudf. source_columns = [*self._columns] source_dtype = source_columns[0].dtype - if is_categorical_dtype(source_dtype): - if any(not is_categorical_dtype(c.dtype) for c in source_columns): + if _is_categorical_dtype(source_dtype): + if any(not _is_categorical_dtype(c.dtype) for c in source_columns): raise ValueError("Columns must all have the same dtype") cats = list(c.categories for c in source_columns) cats = cudf.core.column.concat_columns(cats).unique() @@ -3822,7 +3822,7 @@ def transpose(self): result_columns = libcudf.transpose.transpose(source_columns) - if is_categorical_dtype(source_dtype): + if _is_categorical_dtype(source_dtype): result_columns = [ codes._with_type_metadata( cudf.core.dtypes.CategoricalDtype(categories=cats) @@ -4524,7 +4524,7 @@ def apply_rows( """ for col in incols: current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or is_categorical_dtype( + if is_string_dtype(current_col_dtype) or _is_categorical_dtype( current_col_dtype ): raise TypeError( @@ -6308,7 +6308,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for i_dtype in include: # category handling - if is_categorical_dtype(i_dtype): + if _is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, i_dtype): @@ -6319,7 +6319,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for e_dtype in exclude: # category handling - if is_categorical_dtype(e_dtype): + if _is_categorical_dtype(e_dtype): exclude_subtypes.add(e_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, e_dtype): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 15fbaa04418..6fae552c6ed 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -4,6 +4,7 @@ import operator import pickle import textwrap 
+import warnings from functools import cached_property from typing import Any, Callable, Dict, List, Tuple, Type, Union @@ -957,19 +958,7 @@ def deserialize(cls, header: dict, frames: list): return klass(subtype, closed=closed) -def is_categorical_dtype(obj): - """Check whether an array-like or dtype is of the Categorical dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of a categorical dtype. - """ +def _is_categorical_dtype(obj): if obj is None: return False @@ -1013,13 +1002,40 @@ def is_categorical_dtype(obj): pd.Series, ), ): - return is_categorical_dtype(obj.dtype) + return _is_categorical_dtype(obj.dtype) if hasattr(obj, "type"): if obj.type is pd_CategoricalDtypeType: return True # TODO: A lot of the above checks are probably redundant and should be # farmed out to this function here instead. - return pd_types.is_categorical_dtype(obj) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return pd_types.is_categorical_dtype(obj) + + +def is_categorical_dtype(obj): + """Check whether an array-like or dtype is of the Categorical dtype. + + .. deprecated:: 23.12 + Use isinstance(dtype, cudf.CategoricalDtype) instead + + Parameters + ---------- + obj : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + bool + Whether or not the array-like or dtype is of a categorical dtype. + """ + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "is_categorical_dtype is deprecated and will be removed in a future " + "version. 
Use isinstance(dtype, cudf.CategoricalDtype) instead", + FutureWarning, + ) + return _is_categorical_dtype(obj) def is_list_dtype(obj): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 30e24409dbe..92b244d1999 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -31,7 +31,7 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, - is_categorical_dtype, + _is_categorical_dtype, is_dtype_equal, is_interval_dtype, is_list_like, @@ -2496,7 +2496,7 @@ def __init__( if isinstance(data, CategoricalColumn): data = data elif isinstance(data, pd.Series) and ( - is_categorical_dtype(data.dtype) + _is_categorical_dtype(data.dtype) ): codes_data = column.as_column(data.cat.codes.values) data = column.build_categorical_column( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a3f919c6091..f9435eebe96 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -41,8 +41,8 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, + _is_categorical_dtype, is_bool_dtype, - is_categorical_dtype, is_decimal_dtype, is_dict_like, is_list_dtype, @@ -170,7 +170,7 @@ def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) - if is_categorical_dtype(obj.index): + if _is_categorical_dtype(obj.index): labels = labels.astype("category") codes = labels.codes.astype(obj.index._values.codes.dtype) labels = cudf.core.column.build_categorical_column( @@ -5418,21 +5418,21 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): if lhs_dtype == rhs_dtype: return True elif ( - is_categorical_dtype(lhs_dtype) - and is_categorical_dtype(rhs_dtype) + _is_categorical_dtype(lhs_dtype) + and _is_categorical_dtype(rhs_dtype) and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype ): # OK if 
categories are not all the same return True elif ( - is_categorical_dtype(lhs_dtype) - and not is_categorical_dtype(rhs_dtype) + _is_categorical_dtype(lhs_dtype) + and not _is_categorical_dtype(rhs_dtype) and lhs_dtype.categories.dtype == rhs_dtype ): return True elif ( - is_categorical_dtype(rhs_dtype) - and not is_categorical_dtype(lhs_dtype) + _is_categorical_dtype(rhs_dtype) + and not _is_categorical_dtype(lhs_dtype) and rhs_dtype.categories.dtype == lhs_dtype ): return True diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 85973ee194b..7a80d70acb3 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -540,7 +540,7 @@ def melt( # Error for unimplemented support for datatype dtypes = [frame[col].dtype for col in id_vars + value_vars] - if any(cudf.api.types.is_categorical_dtype(t) for t in dtypes): + if any(cudf.api.types._is_categorical_dtype(t) for t in dtypes): raise NotImplementedError( "Categorical columns are not yet supported for function" ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 0273227010b..35ddffb0f01 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import warnings @@ -9,8 +9,8 @@ from cudf import _lib as libcudf from cudf._lib import strings as libstrings from cudf.api.types import ( + _is_categorical_dtype, _is_non_decimal_numeric_dtype, - is_categorical_dtype, is_datetime_dtype, is_list_dtype, is_string_dtype, @@ -110,7 +110,7 @@ def to_numeric(arg, errors="raise", downcast=None): if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): col = col.as_numerical_column(cudf.dtype("int64")) - elif is_categorical_dtype(dtype): + elif _is_categorical_dtype(dtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): col = col.as_numerical_column(cat_dtype) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 0ab3a244ebe..9c2ee637584 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -9,7 +9,7 @@ import cudf from cudf._lib.unary import is_nan from cudf.api.types import ( - is_categorical_dtype, + _is_categorical_dtype, is_decimal_dtype, is_interval_dtype, is_list_dtype, @@ -86,7 +86,7 @@ def _check_types( if ( exact and not isinstance(left, cudf.MultiIndex) - and is_categorical_dtype(left) + and _is_categorical_dtype(left) ): if left.dtype != right.dtype: raise_assert_detail( @@ -144,8 +144,8 @@ def assert_column_equal( """ if check_dtype is True: if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) + _is_categorical_dtype(left) + and _is_categorical_dtype(right) and not check_categorical ): pass @@ -173,7 +173,7 @@ def assert_column_equal( return if check_exact and check_categorical: - if is_categorical_dtype(left) and is_categorical_dtype(right): + if _is_categorical_dtype(left) and _is_categorical_dtype(right): left_cat = left.categories right_cat = right.categories @@ -207,8 +207,8 @@ def assert_column_equal( if ( not check_dtype - and is_categorical_dtype(left) - and is_categorical_dtype(right) + and _is_categorical_dtype(left) + and _is_categorical_dtype(right) ): left = 
left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) @@ -254,7 +254,7 @@ def assert_column_equal( raise e else: columns_equal = False - if is_categorical_dtype(left) and is_categorical_dtype(right): + if _is_categorical_dtype(left) and _is_categorical_dtype(right): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index ae3d232e542..da29972ea82 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -116,7 +116,7 @@ ), ) def test_is_categorical_dtype(obj, expect): - assert types.is_categorical_dtype(obj) == expect + assert types._is_categorical_dtype(obj) == expect @pytest.mark.parametrize( @@ -1036,9 +1036,10 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): - assert types.is_categorical_dtype(obj) == pd_types.is_categorical_dtype( - obj - ) + with pytest.warns(FutureWarning): + assert types.is_categorical_dtype( + obj + ) == pd_types.is_categorical_dtype(obj) assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj) assert types.is_integer(obj) == pd_types.is_integer(obj) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..ad585518b83 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.api.types.is_categorical_dtype(col.dtype): + if cudf.api.types._is_categorical_dtype(col.dtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif cudf.api.types.is_string_dtype(col.dtype): @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): 
else: pd_series = series.to_pandas() - if cudf.api.types.is_categorical_dtype(col.dtype): + if cudf.api.types._is_categorical_dtype(col.dtype): # The cudf.Series is constructed from an already sliced column, whereas # the pandas.Series is constructed from the unsliced series and then # sliced, so the indexes should be different and we must ignore it. diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 925a522399d..32d22c3e2f5 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -7,7 +7,7 @@ import pytest import cudf as gd -from cudf.api.types import is_categorical_dtype +from cudf.api.types import _is_categorical_dtype from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( @@ -582,8 +582,8 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): - if not is_categorical_dtype(expected[key].dtype): + if _is_categorical_dtype(col.dtype): + if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -1213,8 +1213,8 @@ def test_concat_join_empty_dataframes( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): - if not is_categorical_dtype(expected[key].dtype): + if _is_categorical_dtype(col.dtype): + if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = ( @@ -1336,7 +1336,7 @@ def test_concat_join_empty_dataframes_axis_1( if expected.shape != df.shape: if axis == 0: for key, col in 
actual[actual.columns].items(): - if is_categorical_dtype(col.dtype): + if _is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") # if not expected.empty: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d873597f849..9ab5b835049 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5943,7 +5943,9 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_mask = ps_mask got_mask = gs_mask - if pd.api.types.is_categorical_dtype(expect_where): + if hasattr(expect_where, "dtype") and isinstance( + expect_where, pd.CategoricalDtype + ): np.testing.assert_array_equal( expect_where.cat.codes, got_where.cat.codes.astype(expect_where.cat.codes.dtype) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 01715e3be52..e82f3d581e5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -659,7 +659,7 @@ def test_index_where(data, condition, other, error): gs_other = other if error is None: - if pd.api.types.is_categorical_dtype(ps): + if hasattr(ps, "dtype") and isinstance(ps, pd.CategoricalDtype): expect = ps.where(ps_condition, other=ps_other) got = gs.where(gs_condition, other=gs_other) np.testing.assert_array_equal( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index f3e245f8769..a5e3d1230fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -184,7 +184,7 @@ def cudf_dtype_from_pydata_dtype(dtype): Python dtype. 
""" - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype elif cudf.api.types.is_decimal32_dtype(dtype): return cudf.core.dtypes.Decimal32Dtype @@ -202,7 +202,7 @@ def cudf_dtype_to_pa_type(dtype): """Given a cudf pandas dtype, converts it into the equivalent cuDF Python dtype. """ - if cudf.api.types.is_categorical_dtype(dtype): + if cudf.api.types._is_categorical_dtype(dtype): raise NotImplementedError() elif ( cudf.api.types.is_list_dtype(dtype) @@ -427,9 +427,9 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): - if cudf.api.types.is_categorical_dtype(lhs.dtype): + if cudf.api.types._is_categorical_dtype(lhs.dtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif cudf.api.types.is_categorical_dtype(rhs.dtype): + elif cudf.api.types._is_categorical_dtype(rhs.dtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) return (lhs.dtype == "object" and rhs.dtype != "object") or ( @@ -529,10 +529,10 @@ def find_common_type(dtypes): # Early exit for categoricals since they're not hashable and therefore # can't be put in a set. 
- if any(cudf.api.types.is_categorical_dtype(dtype) for dtype in dtypes): + if any(cudf.api.types._is_categorical_dtype(dtype) for dtype in dtypes): if all( ( - cudf.api.types.is_categorical_dtype(dtype) + cudf.api.types._is_categorical_dtype(dtype) and (not dtype.ordered if hasattr(dtype, "ordered") else True) ) for dtype in dtypes diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index c98e724a72c..155f2d81c23 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -312,7 +312,7 @@ def tolist_cudf(obj): ) @_dask_cudf_nvtx_annotate def is_categorical_dtype_cudf(obj): - return cudf.api.types.is_categorical_dtype(obj) + return cudf.api.types._is_categorical_dtype(obj) @grouper_dispatch.register((cudf.Series, cudf.DataFrame)) @@ -341,7 +341,7 @@ def percentile_cudf(a, q, interpolation="linear"): if isinstance(q, Iterator): q = list(q) - if cudf.api.types.is_categorical_dtype(a.dtype): + if cudf.api.types._is_categorical_dtype(a.dtype): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d6c9c1be73c..c8ddef54e2b 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -15,7 +15,7 @@ from dask.utils import M import cudf as gd -from cudf.api.types import is_categorical_dtype +from cudf.api.types import _is_categorical_dtype from cudf.utils.utils import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported @@ -203,7 +203,7 @@ def quantile_divisions(df, by, npartitions): if ( len(columns) == 1 and df[columns[0]].dtype != "object" - and not is_categorical_dtype(df[columns[0]].dtype) + and not _is_categorical_dtype(df[columns[0]].dtype) ): dtype = df[columns[0]].dtype divisions = divisions[columns[0]].astype("int64") From 7c6d8f2a6faf92c76ba379656133b4dda8358fa6 Mon Sep 17 00:00:00 2001 From: GALI 
PREM SAGAR Date: Thu, 12 Oct 2023 18:24:00 -0500 Subject: [PATCH 064/162] Deprecate is_interval_dtype and is_datetime64tz_dtype (#14275) This PR deprecates `is_datetime64tz_dtype` and `is_interval_dtype` to have parity with pandas-2.x: https://github.com/pandas-dev/pandas/pull/52607, alternatively this PR internalizes these utilities. This PR: ``` = 1584 failed, 98570 passed, 2069 skipped, 776 xfailed, 312 xpassed, 20 errors in 737.24s (0:12:17) = ``` On `pandas_2.0_feature_branch`: ``` = 23835 failed, 5698 passed, 1613 skipped, 288 xfailed, 423 errors in 1976.84s (0:32:56) = ``` --- python/cudf/cudf/_lib/column.pyx | 6 +-- python/cudf/cudf/_lib/groupby.pyx | 9 ++-- python/cudf/cudf/api/types.py | 29 ++++++++--- python/cudf/cudf/core/column/categorical.py | 6 +-- python/cudf/cudf/core/column/column.py | 24 ++++----- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/interval.py | 8 +-- python/cudf/cudf/core/dtypes.py | 55 ++++++++++++--------- python/cudf/cudf/core/index.py | 19 +++---- python/cudf/cudf/testing/testing.py | 6 +-- 10 files changed, 90 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index fbd70de9905..da9ef1f1697 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,7 +11,7 @@ import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import _is_categorical_dtype, is_datetime64tz_dtype +from cudf.api.types import _is_categorical_dtype, _is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -334,7 +334,7 @@ cdef class Column: if _is_categorical_dtype(self.dtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif _is_datetime64tz_dtype(self.dtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -397,7 +397,7 @@ cdef class Column: if _is_categorical_dtype(self.dtype): col = 
self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif _is_datetime64tz_dtype(self.dtype): col = self data_dtype = _get_base_dtype(col.dtype) else: diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0067981169c..5b882bf9d3c 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,16 +1,15 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -from pandas.core.groupby.groupby import DataError - from cudf.api.types import ( _is_categorical_dtype, + _is_interval_dtype, is_decimal_dtype, - is_interval_dtype, is_list_dtype, is_string_dtype, is_struct_dtype, ) from cudf.core.buffer import acquire_spill_lock +from pandas.core.groupby.groupby import DataError from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -191,7 +190,7 @@ cdef class GroupBy: else _STRING_AGGS if is_string_dtype(dtype) else _CATEGORICAL_AGGS if _is_categorical_dtype(dtype) else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _INTERVAL_AGGS if _is_interval_dtype(dtype) else _DECIMAL_AGGS if is_decimal_dtype(dtype) else "ALL" ) @@ -262,7 +261,7 @@ cdef class GroupBy: else _STRING_AGGS if is_string_dtype(dtype) else _CATEGORICAL_AGGS if _is_categorical_dtype(dtype) else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _INTERVAL_AGGS if _is_interval_dtype(dtype) else _DECIMAL_AGGS if is_decimal_dtype(dtype) else "ALL" ) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 007b9f3ee02..4f948fddab7 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -4,6 +4,7 @@ from __future__ import annotations +import warnings from collections import abc from functools import wraps from inspect import isclass @@ -11,14 +12,14 @@ import cupy as cp import numpy as np -import pandas as pd -from pandas.api import types as pd_types import cudf +import 
pandas as pd from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, - dtype, _is_categorical_dtype, + _is_interval_dtype, + dtype, is_decimal32_dtype, is_decimal64_dtype, is_decimal128_dtype, @@ -27,6 +28,7 @@ is_list_dtype, is_struct_dtype, ) +from pandas.api import types as pd_types def is_numeric_dtype(obj): @@ -116,7 +118,7 @@ def is_string_dtype(obj): and not is_decimal_dtype(obj) and not is_list_dtype(obj) and not is_struct_dtype(obj) - and not is_interval_dtype(obj) + and not _is_interval_dtype(obj) ) ) @@ -451,6 +453,22 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: ) +def _is_datetime64tz_dtype(obj): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return _wrap_pandas_is_dtype_api(pd_types.is_datetime64tz_dtype)(obj) + + +def is_datetime64tz_dtype(obj): + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "is_datetime64tz_dtype is deprecated and will be removed in a future " + "version.", + FutureWarning, + ) + return _is_datetime64tz_dtype(obj) + + # TODO: The below alias is removed for now since improving cudf categorical # support is ongoing and we don't want to introduce any ambiguities. The above # method _union_categoricals will take its place once exposed. 
@@ -465,9 +483,6 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( pd_types.is_datetime64_ns_dtype ) -is_datetime64tz_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_datetime64tz_dtype -) is_extension_array_dtype = pd_types.is_extension_array_dtype is_int64_dtype = pd_types.is_int64_dtype is_period_dtype = pd_types.is_period_dtype diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index af0e3257d4e..73ca529b248 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -7,16 +7,16 @@ from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast import numpy as np -import pandas as pd import pyarrow as pa from numba import cuda from typing_extensions import Self import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.api.types import _is_categorical_dtype, is_interval_dtype +from cudf.api.types import _is_categorical_dtype, _is_interval_dtype from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -802,7 +802,7 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if is_interval_dtype(col.categories.dtype): + if _is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. 
# TODO: work on interval index dropna diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 048ce620a8d..c1b74a01fc2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -23,7 +23,6 @@ import cupy import numpy as np -import pandas as pd import pyarrow as pa from numba import cuda from typing_extensions import Self @@ -31,6 +30,7 @@ import rmm import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.null_mask import ( @@ -50,18 +50,18 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( _is_categorical_dtype, + _is_datetime64tz_dtype, + _is_interval_dtype, _is_non_decimal_numeric_dtype, infer_dtype, is_bool_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_decimal32_dtype, is_decimal64_dtype, is_decimal128_dtype, is_decimal_dtype, is_dtype_equal, is_integer_dtype, - is_interval_dtype, is_list_dtype, is_scalar, is_string_dtype, @@ -1014,7 +1014,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: "Casting struct columns not currently supported" ) return self - elif is_interval_dtype(self.dtype): + elif _is_interval_dtype(self.dtype): return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) @@ -1579,7 +1579,7 @@ def build_column( offset=offset, null_count=null_count, ) - elif is_datetime64tz_dtype(dtype): + elif _is_datetime64tz_dtype(dtype): if data is None: raise TypeError("Must specify data buffer") return cudf.core.column.datetime.DatetimeTZColumn( @@ -1618,7 +1618,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_interval_dtype(dtype): + elif _is_interval_dtype(dtype): return cudf.core.column.IntervalColumn( dtype=dtype, mask=mask, @@ -1675,7 +1675,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_interval_dtype(dtype): + elif 
_is_interval_dtype(dtype): return cudf.core.column.IntervalColumn( dtype=dtype, mask=mask, @@ -2045,7 +2045,7 @@ def as_column( "cuDF does not yet support timezone-aware datetimes" ) data = as_column(pa.array(arbitrary, from_pandas=True)) - elif is_interval_dtype(arbitrary.dtype): + elif _is_interval_dtype(arbitrary.dtype): if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" @@ -2287,7 +2287,7 @@ def as_column( ) or ( isinstance(arbitrary, pd.IntervalIndex) - and is_datetime64tz_dtype(arbitrary.dtype.subtype) + and _is_datetime64tz_dtype(arbitrary.dtype.subtype) ) or ( isinstance(arbitrary, pd.CategoricalIndex) @@ -2347,11 +2347,11 @@ def as_column( np_type = None try: if dtype is not None: - if _is_categorical_dtype(dtype) or is_interval_dtype( + if _is_categorical_dtype(dtype) or _is_interval_dtype( dtype ): raise TypeError - if is_datetime64tz_dtype(dtype): + if _is_datetime64tz_dtype(dtype): raise NotImplementedError( "Use `tz_localize()` to construct " "timezone aware data." 
@@ -2499,7 +2499,7 @@ def as_column( elif np_type == np.str_: sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) - elif is_interval_dtype(dtype): + elif _is_interval_dtype(dtype): sr = pd.Series(arbitrary, dtype="interval") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif ( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 75548daf310..3998ca99dba 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -22,8 +22,8 @@ ScalarLike, ) from cudf.api.types import ( + _is_datetime64tz_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_scalar, is_timedelta64_dtype, ) @@ -566,7 +566,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False def _with_type_metadata(self, dtype): - if is_datetime64tz_dtype(dtype): + if _is_datetime64tz_dtype(dtype): return DatetimeTZColumn( data=self.base_data, dtype=dtype, diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7b87552f1a0..bcbe777ee66 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,11 +1,11 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from typing import Optional -import pandas as pd import pyarrow as pa import cudf -from cudf.api.types import _is_categorical_dtype, is_interval_dtype +import pandas as pd +from cudf.api.types import _is_categorical_dtype, _is_interval_dtype from cudf.core.column import StructColumn from cudf.core.dtypes import IntervalDtype @@ -101,11 +101,11 @@ def copy(self, deep=True): ) def as_interval_column(self, dtype, **kwargs): - if is_interval_dtype(dtype): + if _is_interval_dtype(dtype): if _is_categorical_dtype(self): new_struct = self._get_decategorized_column() return IntervalColumn.from_struct_column(new_struct) - if is_interval_dtype(dtype): + if _is_interval_dtype(dtype): # a user can directly input the string `interval` as the dtype # when creating an interval series or interval dataframe if dtype == "interval": diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6fae552c6ed..e293b8a61f5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -9,21 +9,21 @@ from typing import Any, Callable, Dict, List, Tuple, Type, Union import numpy as np -import pandas as pd import pyarrow as pa -from pandas.api import types as pd_types -from pandas.api.extensions import ExtensionDtype -from pandas.core.dtypes.dtypes import ( - CategoricalDtype as pd_CategoricalDtype, - CategoricalDtypeType as pd_CategoricalDtypeType, -) import cudf +import pandas as pd from cudf._typing import Dtype from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply +from pandas.api import types as pd_types +from pandas.api.extensions import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype as pd_CategoricalDtype, + CategoricalDtypeType as pd_CategoricalDtypeType, +) if PANDAS_GE_150: from pandas.core.arrays.arrow.extension_types import ArrowIntervalType @@ -261,7 +261,7 @@ def to_pandas(self) -> 
pd.CategoricalDtype: def _init_categories(self, categories: Any): if categories is None: return categories - if len(categories) == 0 and not is_interval_dtype(categories): + if len(categories) == 0 and not _is_interval_dtype(categories): dtype = "object" # type: Any else: dtype = None @@ -1107,21 +1107,7 @@ def is_decimal_dtype(obj): ) -def is_interval_dtype(obj): - """Check whether an array-like or dtype is of the interval dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the interval dtype. - """ - # TODO: Should there be any branch in this function that calls - # pd.api.types.is_interval_dtype? +def _is_interval_dtype(obj): return ( isinstance( obj, @@ -1135,8 +1121,29 @@ def is_interval_dtype(obj): or ( isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name ) - or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype)) + or (hasattr(obj, "dtype") and _is_interval_dtype(obj.dtype)) + ) + + +def is_interval_dtype(obj): + """Check whether an array-like or dtype is of the interval dtype. + + Parameters + ---------- + obj : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + bool + Whether or not the array-like or dtype is of the interval dtype. + """ + warnings.warn( + "is_interval_dtype is deprecated and will be removed in a " + "future version. 
Use `isinstance(dtype, cudf.IntervalDtype)` instead", + FutureWarning, ) + return _is_interval_dtype(obj) def is_decimal32_dtype(obj): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 92b244d1999..6d144b36a65 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -7,22 +7,14 @@ import warnings from functools import cache, cached_property from numbers import Number -from typing import ( - Any, - List, - MutableMapping, - Optional, - Tuple, - Union, -) +from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy import numpy as np -import pandas as pd -from pandas._config import get_option from typing_extensions import Self import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence @@ -30,10 +22,10 @@ from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_categorical_dtype, + _is_interval_dtype, + _is_non_decimal_numeric_dtype, is_dtype_equal, - is_interval_dtype, is_list_like, is_scalar, ) @@ -70,6 +62,7 @@ _warn_no_dask_cudf, search_range, ) +from pandas._config import get_option class IndexMeta(type): @@ -2713,7 +2706,7 @@ def __init__( if isinstance(data, IntervalColumn): data = data - elif isinstance(data, pd.Series) and (is_interval_dtype(data.dtype)): + elif isinstance(data, pd.Series) and (_is_interval_dtype(data.dtype)): data = column.as_column(data, data.dtype) elif isinstance(data, (pd._libs.interval.Interval, pd.IntervalIndex)): data = column.as_column( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9c2ee637584..3e8c986ab95 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -4,14 +4,14 @@ import cupy as cp import numpy as np -import pandas as pd import cudf +import pandas as pd from 
cudf._lib.unary import is_nan from cudf.api.types import ( _is_categorical_dtype, + _is_interval_dtype, is_decimal_dtype, - is_interval_dtype, is_list_dtype, is_numeric_dtype, is_string_dtype, @@ -28,7 +28,7 @@ def dtype_can_compare_equal_to_other(dtype): or is_list_dtype(dtype) or is_struct_dtype(dtype) or is_decimal_dtype(dtype) - or is_interval_dtype(dtype) + or _is_interval_dtype(dtype) ) From 2461315ed223d214dcb38414344f3207eafd6630 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Oct 2023 11:50:56 -0500 Subject: [PATCH 065/162] Deprecate `method` in `fillna` API (#14278) This PR deprecates `method` parameter in all public `fillna` APIs to match pandas: https://github.com/pandas-dev/pandas/pull/53496/ This PR: ``` = 1056 failed, 99098 passed, 2069 skipped, 776 xfailed, 312 xpassed, 20 errors in 670.87s (0:11:10) = ``` On `pandas_2.0_feature_branch`: ``` = 1584 failed, 98570 passed, 2069 skipped, 776 xfailed, 312 xpassed, 20 errors in 737.24s (0:12:17) = ``` --- python/cudf/cudf/core/dataframe.py | 4 +++- python/cudf/cudf/core/frame.py | 4 +++- python/cudf/cudf/core/groupby/groupby.py | 13 ++++++++++++- python/cudf/cudf/core/indexed_frame.py | 8 ++++++++ python/cudf/cudf/core/resample.py | 5 ++++- python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_groupby.py | 12 ++++++++---- python/cudf/cudf/tests/test_replace.py | 20 +++++++++++++++----- 8 files changed, 56 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0cfa37c224d..793742604a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7031,7 +7031,9 @@ def pct_change( "'bfill', or 'backfill'." 
) - data = self.fillna(method=fill_method, limit=limit) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( periods=periods, freq=freq diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 43a713e273d..0b627c12d97 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -735,13 +735,15 @@ def fillna( are filled with values in corresponding indices. A dict can be used to provide different values to fill nulls in different columns. Cannot be used with ``method``. - method : {'ffill', 'bfill'}, default None Method to use for filling null values in the dataframe or series. `ffill` propagates the last non-null values forward to the next non-null value. `bfill` propagates backward with the next non-null value. Cannot be used with ``value``. + .. deprecated:: 23.12 + `method` is deprecated. + Returns ------- result : DataFrame, Series, or Index diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0fc61713eeb..fcff8e805bf 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2183,6 +2183,14 @@ def fillna( if method is not None: if method not in {"ffill", "bfill"}: raise ValueError("Method can only be of 'ffill', 'bfill'.") + # Do not remove until pandas 3.0 support is added. + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is " + "deprecated and will raise in a future version. 
" + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) + return getattr(self, method, limit)() values = self.obj.__class__._from_data( @@ -2295,7 +2303,10 @@ def pct_change( FutureWarning, ) - filled = self.fillna(method=fill_method, limit=limit) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + filled = self.fillna(method=fill_method, limit=limit) + fill_grp = filled.groupby(self.grouping) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index f9435eebe96..028c6cf208b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2148,6 +2148,14 @@ def _split(self, splits, keep_index=True): def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): # noqa: D102 + if method is not None: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is " + "deprecated and will raise in a future version. " + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) old_index = self._index ret = super().fillna(value, method, axis, inplace, limit) if inplace: diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index eb59cf83926..83a003cb949 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,6 +15,7 @@ # limitations under the License. 
import pickle +import warnings import numpy as np import pandas as pd @@ -73,7 +74,9 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: ) # fill the gaps: - filled = upsampled.fillna(method=method) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + filled = upsampled.fillna(method=method) # filter the result to only include the values corresponding # to the bin labels: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 47808259f14..9e82a353282 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3506,7 +3506,9 @@ def pct_change( "'bfill', or 'backfill'." ) - data = self.fillna(method=fill_method, limit=limit) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) change = diff / data.shift(periods=periods, freq=freq) return change diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 50a749a25b8..2f83348bcff 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2646,10 +2646,12 @@ def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) gs = cudf.from_pandas(ps) - expect = ps.groupby(by).fillna(**args) + with expect_warning_if(PANDAS_GE_210 and "method" in args): + expect = ps.groupby(by).fillna(**args) if isinstance(by, pd.Grouper): by = cudf.Grouper(level=by.level) - got = gs.groupby(by).fillna(**args) + with expect_warning_if("method" in args): + got = gs.groupby(by).fillna(**args) assert_groupby_results_equal(expect, got, check_dtype=False) @@ -2693,8 +2695,10 @@ def test_groupby_fillna_method(nelem, method): pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(key_col).fillna(method=method) - got = gdf.groupby(key_col).fillna(method=method) + with expect_warning_if(PANDAS_GE_210): + expect = 
pdf.groupby(key_col).fillna(method=method) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(method=method) assert_groupby_results_equal( expect[value_cols], got[value_cols], sort=False diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 364afacd261..3ab7064e2d0 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,13 +8,19 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200 +from cudf.core._compat import ( + PANDAS_GE_134, + PANDAS_GE_150, + PANDAS_GE_200, + PANDAS_GE_210, +) from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) @@ -348,8 +354,10 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): # Explicitly using nans_as_nulls=True gdata = cudf.from_pandas(pdata, nan_as_null=True) - expected = pdata.fillna(method=method, inplace=inplace) - actual = gdata.fillna(method=method, inplace=inplace) + with expect_warning_if(PANDAS_GE_210): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = gdata.fillna(method=method, inplace=inplace) if inplace: expected = pdata @@ -665,8 +673,10 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): # Explicitly using nans_as_nulls=True gdata = cudf.from_pandas(pdata, nan_as_null=True) - expected = pdata.fillna(method=method, inplace=inplace) - actual = gdata.fillna(method=method, inplace=inplace) + with expect_warning_if(PANDAS_GE_210): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = gdata.fillna(method=method, inplace=inplace) if inplace: expected = pdata From 90788f27953f23b75d88ba53dce99c93b8990292 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 
29 Nov 2023 11:46:35 -0600 Subject: [PATCH 066/162] Deprecate `fill_method` and `limit` in `pct_change` APIs (#14277) This PR deprecated `fill_method` and `limit` in `Series.pct_change`, `DataFrame.pct_change` and `Groupby.pct_change` This PR: ``` = 1263 failed, 98996 passed, 2069 skipped, 776 xfailed, 312 xpassed, 20 errors in 701.08s (0:11:41) = ``` On `pandas_2.0_feature_branch`: ``` = 1584 failed, 98570 passed, 2069 skipped, 776 xfailed, 312 xpassed, 20 errors in 737.24s (0:12:17) = ``` --- python/cudf/cudf/core/dataframe.py | 36 +++++++++++++++++++--- python/cudf/cudf/core/groupby/groupby.py | 38 +++++++++++++++++------- python/cudf/cudf/core/series.py | 36 +++++++++++++++++++--- python/cudf/cudf/tests/test_dataframe.py | 15 ++++++++-- python/cudf/cudf/tests/test_groupby.py | 23 +++++++++----- python/cudf/cudf/tests/test_stats.py | 18 +++++++++-- 6 files changed, 135 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b4f8669ba9d..63139231d75 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7137,7 +7137,7 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method="ffill", limit=None, freq=None + self, periods=1, fill_method=no_default, limit=no_default, freq=None ): """ Calculates the percent change between sequential elements @@ -7149,9 +7149,15 @@ def pct_change( Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 23.12 + All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 23.12 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. 
@@ -7160,16 +7166,38 @@ def pct_change( ------- DataFrame """ - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in { + no_default, + None, + "ffill", + "pad", + "bfill", + "backfill", + }: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " + "fill_method must be one of None, 'ffill', 'pad', " "'bfill', or 'backfill'." ) + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "The 'fill_method' and 'limit' keywords in " + f"{type(self).__name__}.pct_change are deprecated and will be " + "removed in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", + FutureWarning, + ) + if fill_method is no_default: + fill_method = "ffill" + if limit is no_default: + limit = None + with warnings.catch_warnings(): warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index bad5106970e..414a86470f0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,6 +21,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, arange, as_column @@ -2286,7 +2287,12 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): @_cudf_nvtx_annotate def pct_change( - self, 
periods=1, fill_method="ffill", axis=0, limit=None, freq=None + self, + periods=1, + fill_method=no_default, + axis=0, + limit=no_default, + freq=None, ): """ Calculates the percent change between sequential elements @@ -2298,9 +2304,15 @@ def pct_change( Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 23.12 + All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 23.12 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. @@ -2312,25 +2324,31 @@ def pct_change( """ if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in {no_default, None, "ffill", "bfill"}: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " - "'bfill', or 'backfill'." + "fill_method must be one of 'ffill', or" "'bfill'." ) - if fill_method in ("pad", "backfill"): - alternative = "ffill" if fill_method == "pad" else "bfill" - # Do not remove until pandas 2.0 support is added. + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. warnings.warn( - f"{fill_method} is deprecated and will be removed in a future " - f"version. Use f{alternative} instead.", + "The 'fill_method' keyword being not None and the 'limit' keywords in " + f"{type(self).__name__}.pct_change are deprecated and will be " + "removed in a future version. 
Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, ) + if fill_method in (no_default, None): + fill_method = "ffill" + if limit is no_default: + limit = None + with warnings.catch_warnings(): warnings.simplefilter("ignore") filled = self.fillna(method=fill_method, limit=limit) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f0323d6f55b..f9987569070 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3569,7 +3569,7 @@ def explode(self, ignore_index=False): @_cudf_nvtx_annotate def pct_change( - self, periods=1, fill_method="ffill", limit=None, freq=None + self, periods=1, fill_method=no_default, limit=no_default, freq=None ): """ Calculates the percent change between sequential elements @@ -3581,9 +3581,15 @@ def pct_change( Periods to shift for forming percent change. fill_method : str, default 'ffill' How to handle NAs before computing percent changes. + + .. deprecated:: 23.12 + All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. + + .. deprecated:: 23.12 + `limit` is deprecated. freq : str, optional Increment to use from time series API. Not yet implemented. @@ -3592,15 +3598,37 @@ def pct_change( ------- Series """ - if limit is not None: + if limit is not no_default: raise NotImplementedError("limit parameter not supported yet.") if freq is not None: raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + elif fill_method not in { + no_default, + None, + "ffill", + "pad", + "bfill", + "backfill", + }: raise ValueError( - "fill_method must be one of 'ffill', 'pad', " + "fill_method must be one of None, 'ffill', 'pad', " "'bfill', or 'backfill'." 
) + if fill_method not in (no_default, None) or limit is not no_default: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "The 'fill_method' and 'limit' keywords in " + f"{type(self).__name__}.pct_change are deprecated and will be " + "removed in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", + FutureWarning, + ) + + if fill_method is no_default: + fill_method = "ffill" + if limit is no_default: + limit = None with warnings.catch_warnings(): warnings.simplefilter("ignore") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9192e5e7ca0..9a51ef5ed57 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -25,8 +25,10 @@ PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200, + PANDAS_GE_210, PANDAS_LT_140, ) +from cudf.api.extensions import no_default from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.testing import _utils as utils @@ -9896,13 +9898,20 @@ def test_dataframe_rename_duplicate_column(): ], ) @pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default] +) def test_dataframe_pct_change(data, periods, fill_method): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - actual = gdf.pct_change(periods=periods, fill_method=fill_method) - expected = pdf.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if(fill_method is not no_default): + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + PANDAS_GE_210 + and (fill_method is not no_default or pdf.isna().any().any()) + ): + expected = pdf.pct_change(periods=periods, fill_method=fill_method) assert_eq(expected, 
actual) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 65c48c1b12d..fd0f7863d2b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -19,6 +19,7 @@ import cudf from cudf import DataFrame, Series +from cudf.api.extensions import no_default from cudf.core._compat import ( PANDAS_GE_150, PANDAS_LT_140, @@ -3062,17 +3063,25 @@ def test_groupby_transform_maintain_index(by): ], ) @pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None]) def test_groupby_pct_change(data, gkey, periods, fill_method): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - actual = gdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - expected = pdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) + with expect_warning_if(fill_method not in (no_default, None)): + actual = gdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + with expect_warning_if( + PANDAS_GE_210 + and ( + fill_method not in (no_default, None) + or (fill_method is not None and pdf.isna().any().any()) + ) + ): + expected = pdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 8eae74a34f7..41fac49ea83 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -8,6 +8,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.datasets import randomdata from cudf.testing._utils import ( _create_cudf_series_float64_default, @@ -16,6 +17,7 @@ assert_exceptions_equal, expect_warning_if, ) +from cudf.core._compat import PANDAS_GE_210 params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", 
"mean", "var", "std"] @@ -356,14 +358,24 @@ def test_series_median(dtype, num_na): ], ) @pytest.mark.parametrize("periods", range(-5, 5)) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] +) def test_series_pct_change(data, periods, fill_method): cs = cudf.Series(data) ps = cs.to_pandas() if np.abs(periods) <= len(cs): - got = cs.pct_change(periods=periods, fill_method=fill_method) - expected = ps.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if(fill_method not in (no_default, None)): + got = cs.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + PANDAS_GE_210 + and ( + fill_method not in (no_default, None) + or (fill_method is not None and ps.isna().any()) + ) + ): + expected = ps.pct_change(periods=periods, fill_method=fill_method) np.testing.assert_array_almost_equal( got.to_numpy(na_value=np.nan), expected ) From c51444fef24bd6ab812808f614b024c31f8bbe22 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 02:10:31 -0500 Subject: [PATCH 067/162] Replace PandasArray with NumpyExtensionArray (#14549) This PR replaces usages of `PandasArray` with `NumpyExtensionArray` to not have warnings during runtime. 
On `pandas_2.0_feature_branch`: ``` = 15895 failed, 61649 passed, 1840 skipped, 735 xfailed, 312 xpassed, 371 errors in 4361.35s (1:12:41) = ``` On this PR: ``` = 923 failed, 100684 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1216.98s (0:20:16) = ``` --- python/cudf/cudf/core/column/column.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 05517da24b1..b79d0644696 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2109,13 +2109,13 @@ def as_column( ) elif isinstance( arbitrary.dtype, pd.api.extensions.ExtensionDtype - ) and not isinstance(arbitrary, pd.arrays.PandasArray): + ) and not isinstance(arbitrary, NumpyExtensionArray): raise NotImplementedError( "Custom pandas ExtensionDtypes are not supported" ) elif arbitrary.dtype.kind in "fiubmM": # numpy dtype like - if isinstance(arbitrary, pd.arrays.PandasArray): + if isinstance(arbitrary, NumpyExtensionArray): arbitrary = np.array(arbitrary) arb_dtype = np.dtype(arbitrary.dtype) if arb_dtype.kind == "f" and arb_dtype.itemsize == 2: @@ -2129,8 +2129,8 @@ def as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) elif arbitrary.dtype.kind == "O": - if isinstance(arbitrary, pd.arrays.PandasArray): - # infer_dtype does not handle PandasArray + if isinstance(arbitrary, NumpyExtensionArray): + # infer_dtype does not handle NumpyExtensionArray arbitrary = np.array(arbitrary, dtype=object) inferred_dtype = infer_dtype(arbitrary) if inferred_dtype in ("mixed-integer", "mixed-integer-float"): From e04b88b5dc86f696843b896dd0f2dc3cfbec09a7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 02:13:58 -0500 Subject: [PATCH 068/162] Fix copy creation of a columnAccessor (#14551) This PR fixes a copy creation in ColumnAccessor by properly passing the rangeindex and label_dtype to its newly constructed object.
--- python/cudf/cudf/core/column_accessor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index f6f3fe7d8fd..679b3e340f7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -366,11 +366,15 @@ def copy(self, deep=False) -> ColumnAccessor: {k: v.copy(deep=deep) for k, v in self._data.items()}, multiindex=self.multiindex, level_names=self.level_names, + rangeindex=self.rangeindex, + label_dtype=self.label_dtype, ) return self.__class__( self._data.copy(), multiindex=self.multiindex, level_names=self.level_names, + rangeindex=self.rangeindex, + label_dtype=self.label_dtype, ) def select_by_label(self, key: Any) -> ColumnAccessor: From 29b3ac80d05a1a36c2868b4e15d79ffd7185fce6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 02:14:39 -0500 Subject: [PATCH 069/162] Fix to_pandas calls (#14552) This PR removes nullable=True in two pytests as we error when nullable is passed when there is decimal / list / struct data. 
--- python/cudf/cudf/tests/test_parquet.py | 1 - python/cudf/cudf/tests/test_udf_masked_ops.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 5390ca72c0a..4d16bb4857e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2661,7 +2661,6 @@ def test_parquet_writer_decimal(decimal_type, data): gdf.to_parquet(buff) got = pd.read_parquet(buff, dtype_backend="numpy_nullable") - assert_eq(gdf.to_pandas(nullable=True), got) assert_eq(gdf["val"].to_pandas(nullable=True), got["val"]) assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"]) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 04f4d12b78e..bd31fbd7f51 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -185,14 +185,13 @@ def func(row): gdf["a"] = gdf["a"].astype(dtype_l) gdf["b"] = gdf["b"].astype(dtype_r) - pdf = gdf.to_pandas(nullable=True) - + pdf = gdf.to_pandas() expect = op(pdf["a"], pdf["b"]) obtain = gdf.apply(func, axis=1) assert_eq(expect, obtain, check_dtype=False) # TODO: After the following pandas issue is # fixed, uncomment the following line and delete - # through `to_pandas(nullable=True)` statement. + # through `to_pandas()` statement. # https://github.com/pandas-dev/pandas/issues/52411 # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) From 19952eb92cbc1d118d2613ac452d3771bae4e458 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 02:15:34 -0500 Subject: [PATCH 070/162] Add missing `is_categorical_dtype` to `cudf.api.types` namespace (#14555) This PR adds back cudf.api.types.is_categorical that was missing due to a bad merge. 
--- python/cudf/cudf/api/types.py | 1 + python/cudf/cudf/tests/test_api_types.py | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 4ad7e4b1db2..c921a48a599 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -21,6 +21,7 @@ _is_categorical_dtype, _is_interval_dtype, dtype, + is_categorical_dtype, is_decimal32_dtype, is_decimal64_dtype, is_decimal128_dtype, diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index da29972ea82..d640e8e1376 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -6,9 +6,11 @@ from pandas.api import types as pd_types import cudf -from cudf.core._compat import PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.api import types +from cudf.testing._utils import expect_warning_if + @pytest.mark.parametrize( "obj, expect", @@ -1036,10 +1038,11 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): + with expect_warning_if(PANDAS_GE_210): + expected = pd_types.is_categorical_dtype(obj) with pytest.warns(FutureWarning): - assert types.is_categorical_dtype( - obj - ) == pd_types.is_categorical_dtype(obj) + actual = types.is_categorical_dtype(obj) + assert expected == actual assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj) assert types.is_integer(obj) == pd_types.is_integer(obj) From ac07b3d21d522d8298f51a88236d5203a418e109 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 02:25:55 -0500 Subject: [PATCH 071/162] Fix name in Index.difference (#14556) This PR fixes result names for Index.difference in some early exit scenarios. 
--- python/cudf/cudf/core/_base_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 0f7e85f1cc2..61a5a4a5d68 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1109,6 +1109,7 @@ def difference(self, other, sort=None): if is_mixed_with_object_dtype(self, other) or len(other) == 0: difference = self.copy() + difference.name = res_name if sort is True: return difference.sort_values() else: @@ -1122,12 +1123,11 @@ def difference(self, other, sort=None): ) ._data ) + difference.name = res_name if self.dtype != other.dtype: difference = difference.astype(self.dtype) - difference.name = res_name - if sort in {None, True} and len(other): return difference.sort_values() From 2bdd8b8acd0e365931ef418bb815a52cdf237772 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 10:13:29 -0500 Subject: [PATCH 072/162] Filter deprecation warning in `ffill` and `bfill` APIs (#14554) This PR doesn't let the fillna warnings propagate to the user when ffill and bfill APIs are invoked. --- python/cudf/cudf/core/indexed_frame.py | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 79d3bdf4fc3..246d5b934a5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2174,13 +2174,15 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. 
""" - return self.fillna( - method="bfill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + return self.fillna( + method="bfill", + value=value, + axis=axis, + inplace=inplace, + limit=limit, + ) @_cudf_nvtx_annotate def backfill(self, value=None, axis=None, inplace=None, limit=None): @@ -2211,13 +2213,15 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. """ - return self.fillna( - method="ffill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + return self.fillna( + method="ffill", + value=value, + axis=axis, + inplace=inplace, + limit=limit, + ) @_cudf_nvtx_annotate def pad(self, value=None, axis=None, inplace=None, limit=None): From a068b10fad7f6ca6a0cbe5cdb4be299e77e494f8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 10:15:28 -0500 Subject: [PATCH 073/162] Fix typo in value_counts (#14550) This PR fixes the return type of Series.value_counts to return int64, correcting a typo that was int34. 
--- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c66b893f757..61d7c8d5437 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3120,7 +3120,7 @@ def value_counts( if dropna and self.null_count == len(self): return Series( [], - dtype=np.int34, + dtype=np.int64, name=result_name, index=cudf.Index([], dtype=self.dtype, name=self.name), ) From ccfbe7161e729e3a6d1f7b232973cf827f55e113 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 18:33:03 -0500 Subject: [PATCH 074/162] Enforce `Index.to_frame` deprecations (#14553) This PR enforces deprecations of Index.to_frame and updates pytests related to this API. --- python/cudf/cudf/core/_base_index.py | 20 +++++--------------- python/cudf/cudf/core/multiindex.py | 21 +++++++-------------- python/cudf/cudf/tests/test_index.py | 5 ++--- python/cudf/cudf/tests/test_multiindex.py | 22 ++++++++++------------ 4 files changed, 24 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 61a5a4a5d68..0a70f3050eb 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -844,22 +844,12 @@ def to_frame(self, index=True, name=no_default): 1 Bear 2 Cow """ - if name is None: - warnings.warn( - "Explicitly passing `name=None` currently preserves " - "the Index's name or uses a default name of 0. 
This " - "behaviour is deprecated, and in the future `None` " - "will be used as the name of the " - "resulting DataFrame column.", - FutureWarning, - ) - name = no_default - if name is not no_default: - col_name = name - elif self.name is None: - col_name = 0 + + if name is no_default: + col_name = 0 if self.name is None else self.name else: - col_name = self.name + col_name = name + return cudf.DataFrame( {col_name: self._values}, index=self if index else None ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3218d8a735f..5c2b4e6c7b0 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,6 @@ import numbers import operator import pickle -import warnings from collections import abc from functools import cached_property from numbers import Integral @@ -1023,25 +1022,19 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - if name is None: - warnings.warn( - "Explicitly passing `name=None` currently preserves the " - "Index's name or uses a default name of 0. This behaviour " - "is deprecated, and in the future `None` will be used " - "as the name of the resulting DataFrame column.", - FutureWarning, - ) - name = no_default - - if name is not no_default: + if name is no_default: + column_names = [ + level if name is None else name + for level, name in enumerate(self.names) + ] + else: if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." 
) column_names = name - else: - column_names = self.names + all_none_names = None if not ( all_none_names := all(x is None for x in column_names) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 031143fc9f5..445fc84981b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3098,10 +3098,9 @@ def test_index_to_frame(data, data_name, index, name): pidx = pd.Index(data, name=data_name) gidx = cudf.from_pandas(pidx) - with expect_warning_if(name is None): + with expect_warning_if(not PANDAS_GE_200 and name is None): expected = pidx.to_frame(index=index, name=name) - with expect_warning_if(name is None): - actual = gidx.to_frame(index=index, name=name) + actual = gidx.to_frame(index=index, name=name) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index ddaf83a4c9b..5fdeacc346f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1989,22 +1989,20 @@ def test_multiindex_to_frame_allow_duplicates( ) or (isinstance(name, list) and len(name) != len(set(name))): # cudf doesn't have the ability to construct dataframes # with duplicate column names - with expect_warning_if(name is None): - with pytest.raises(ValueError): - gidx.to_frame( - index=index, - name=name, - allow_duplicates=allow_duplicates, - ) + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) else: - with expect_warning_if(name is None): + with expect_warning_if(not PANDAS_GE_200 and name is None): expected = pidx.to_frame( index=index, name=name, allow_duplicates=allow_duplicates ) - with expect_warning_if(name is None): - actual = gidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) assert_eq(expected, actual) 
From 9b478b002aa036c1b8252214b3911e7e10902db9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 Dec 2023 19:59:22 -0500 Subject: [PATCH 075/162] Deprecate DataFrame.applymap and use map instead (#14579) Pandas 2.1.0 deprecated DataFrame.applymap, This PR deprecated applymap and introduces map to be used as the new alternative API. --- .../source/user_guide/api_docs/dataframe.rst | 5 +-- python/cudf/cudf/core/dataframe.py | 32 +++++++++++++++++++ python/cudf/cudf/tests/test_applymap.py | 19 +++++++---- python/cudf/cudf/tests/test_parquet.py | 2 +- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/dataframe.rst b/docs/cudf/source/user_guide/api_docs/dataframe.rst index dd685447025..90227541e4a 100644 --- a/docs/cudf/source/user_guide/api_docs/dataframe.rst +++ b/docs/cudf/source/user_guide/api_docs/dataframe.rst @@ -105,13 +105,14 @@ Function application, GroupBy & window .. autosummary:: :toctree: api/ + DataFrame.agg DataFrame.apply DataFrame.applymap DataFrame.apply_chunks DataFrame.apply_rows - DataFrame.pipe - DataFrame.agg DataFrame.groupby + DataFrame.map + DataFrame.pipe DataFrame.rolling .. _api.dataframe.stats: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a8e5aecfb30..3118dfa4490 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4532,6 +4532,38 @@ def applymap( This method applies a function that accepts and returns a scalar to every element of a DataFrame. + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + + Returns + ------- + DataFrame + Transformed DataFrame. + """ + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "DataFrame.applymap has been deprecated. 
Use DataFrame.map " + "instead.", + FutureWarning, + ) + return self.map(func=func, na_action=na_action, **kwargs) + + def map( + self, + func: Callable[[Any], Any], + na_action: Union[str, None] = None, + **kwargs, + ) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + Parameters ---------- func : callable diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 32f3e39dd7c..9c0115fbc29 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -1,9 +1,10 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import pytest from cudf import NA, DataFrame from cudf.testing import _utils as utils +from cudf.core._compat import PANDAS_GE_210 @pytest.mark.parametrize( @@ -29,8 +30,10 @@ def test_applymap_dataframe(data, func, na_action): gdf = DataFrame(data) pdf = gdf.to_pandas(nullable=True) - expect = pdf.applymap(func, na_action=na_action) - got = gdf.applymap(func, na_action=na_action) + with utils.expect_warning_if(PANDAS_GE_210): + expect = pdf.applymap(func, na_action=na_action) + with pytest.warns(FutureWarning): + got = gdf.applymap(func, na_action=na_action) utils.assert_eq(expect, got, check_dtype=False) @@ -41,8 +44,10 @@ def test_applymap_raise_cases(): def f(x, some_kwarg=0): return x + some_kwarg - with pytest.raises(NotImplementedError): - df.applymap(f, some_kwarg=1) + with pytest.warns(FutureWarning): + with pytest.raises(NotImplementedError): + df.applymap(f, some_kwarg=1) - with pytest.raises(ValueError): - df.applymap(f, na_action="some_invalid_option") + with pytest.warns(FutureWarning): + with pytest.raises(ValueError): + df.applymap(f, na_action="some_invalid_option") diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 4d16bb4857e..adadf147503 100644 
--- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2823,7 +2823,7 @@ def postprocess(val): fname = datadir / "one_level_list2.parquet" expect = pd.read_parquet(fname) - expect = expect.applymap(postprocess) + expect = expect.map(postprocess) got = cudf.read_parquet(fname) assert_eq(expect, got, check_dtype=False) From 0e83e2094a3f2c9a1bc9f1f796eb174d715d70f6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 Dec 2023 08:48:31 -0600 Subject: [PATCH 076/162] Deprecate first and last (#14583) This PR deprecates first and last APIs to bring parity with pandas, where these APIs were deprecated starting 2.1.0 --- python/cudf/cudf/core/indexed_frame.py | 12 ++++++++++++ python/cudf/cudf/tests/test_datetime.py | 25 ++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 246d5b934a5..ff626c12e0e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3318,6 +3318,12 @@ def first(self, offset): 2018-04-09 1 2018-04-11 2 """ + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "first is deprecated and will be removed in a future version. " + "Please create a mask and filter using `.loc` instead", + FutureWarning, + ) return self._first_or_last( offset, idx=0, @@ -3364,6 +3370,12 @@ def last(self, offset): 2018-04-13 3 2018-04-15 4 """ + # Do not remove until pandas 3.0 support is added. + warnings.warn( + "last is deprecated and will be removed in a future version. 
" + "Please create a mask and filter using `.loc` instead", + FutureWarning, + ) return self._first_or_last( offset, idx=-1, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 521c1303a52..2368b3e539c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -13,7 +13,12 @@ import warnings import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_EQ_200 +from cudf.core._compat import ( + PANDAS_GE_150, + PANDAS_LT_140, + PANDAS_EQ_200, + PANDAS_GE_210, +) from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -2070,8 +2075,10 @@ def test_first(idx, offset): p = pd.Series(range(len(idx)), dtype="int64", index=idx) g = cudf.from_pandas(p) - expect = p.first(offset=offset) - got = g.first(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) assert_eq(expect, got) @@ -2100,8 +2107,10 @@ def test_first_start_at_end_of_month(idx, offset): p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) - expect = p.first(offset=offset) - got = g.first(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) assert_eq(expect, got) @@ -2137,8 +2146,10 @@ def test_last(idx, offset): p = pd.Series(range(len(idx)), dtype="int64", index=idx) g = cudf.from_pandas(p) - expect = p.last(offset=offset) - got = g.last(offset=offset) + with expect_warning_if(PANDAS_GE_210): + expect = p.last(offset=offset) + with pytest.warns(FutureWarning): + got = g.last(offset=offset) assert_eq(expect, got) From 5f3ecd6a7909dc46d9d85dc2b2a162cff4a2c377 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 14 Dec 2023 00:37:49 +0530 Subject: [PATCH 077/162] 
Fix CategoricalDtype docstring (#14622) This PR fixes the docstring in CategoricalDtype where the repr has added a new field (categories_dtype). This PR fixes 2 doctest failures. --- python/cudf/cudf/core/dtypes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 070aacd49c8..834b384d892 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -228,11 +228,11 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> import pandas as pd >>> pd_dtype = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> pd_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) >>> cudf_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) - """ + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) + """ # noqa: E501 return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) @@ -246,10 +246,10 @@ def to_pandas(self) -> pd.CategoricalDtype: >>> import cudf >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> dtype - CategoricalDtype(categories=['b', 'a'], ordered=True) + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> dtype.to_pandas() - CategoricalDtype(categories=['b', 'a'], ordered=True) - """ + CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) + """ # noqa: E501 if self._categories is None: categories = None else: From 72221b3c72efbe521e08c350594a312c246024d9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 14 Dec 2023 00:39:03 +0530 Subject: [PATCH 078/162] Fix `DataFrame.sort_index` when a index is a `MultiIndex` (#14621) This PR fixes sorting of a MultiIndex by removing an existing hard-coded na_position value that was based on 
ascending flag, essentially ignoring the user-passed parameter. On pandas_2.0_feature_branch: = 501 failed, 101106 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1234.91s (0:20:34) = This PR: = 405 failed, 101034 passed, 2071 skipped, 954 xfailed, 312 xpassed, 20 errors in 1124.69s (0:18:44) = --- python/cudf/cudf/core/indexed_frame.py | 2 -- python/cudf/cudf/tests/test_dataframe.py | 19 ++++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ff626c12e0e..b4fba1eef07 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1611,8 +1611,6 @@ def sort_index( idx = self.index if isinstance(idx, MultiIndex): if level is not None: - # Pandas doesn't handle na_position in case of MultiIndex. - na_position = "first" if ascending is True else "last" if not is_list_like(level): level = [level] by = list(map(idx._get_level_label, level)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c89b5b507f5..e18c1809fd4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3492,8 +3492,16 @@ def test_dataframe_sort_index( @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_mulitindex_sort_index( - axis, level, ascending, inplace, ignore_index, na_position + request, axis, level, ascending, inplace, ignore_index, na_position ): + request.applymarker( + pytest.mark.xfail( + condition=axis in (1, "columns") + and ignore_index + and not (level is None and not ascending), + reason="https://github.com/pandas-dev/pandas/issues/56478", + ) + ) pdf = pd.DataFrame( { "b": [1.0, 3.0, np.nan], @@ -3505,17 +3513,14 @@ def test_dataframe_mulitindex_sort_index( ).set_index(["b", "a", 1]) gdf = cudf.DataFrame.from_pandas(pdf) - # ignore_index is 
supported in v.1.0 - expected = pdf.sort_index( axis=axis, level=level, ascending=ascending, inplace=inplace, na_position=na_position, + ignore_index=ignore_index, ) - if ignore_index is True: - expected = expected got = gdf.sort_index( axis=axis, level=level, @@ -3526,12 +3531,8 @@ def test_dataframe_mulitindex_sort_index( ) if inplace is True: - if ignore_index is True: - pdf = pdf.reset_index(drop=True) assert_eq(pdf, gdf) else: - if ignore_index is True: - expected = expected.reset_index(drop=True) assert_eq(expected, got) From d7dc16e85e2ab9bb38bb12c916e65ff8dd24e852 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 14 Dec 2023 00:40:04 +0530 Subject: [PATCH 079/162] Deprecate reading literal string in cudf.read_json (#14619) This PR deprecates reading literal strings in read_json, instead users will need to pass StringIO for these cases to silence the warning. This change is to match: pandas-dev/pandas#53409 On pandas_2.0_feature_branch: = 501 failed, 101106 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1234.91s (0:20:34) = This PR: = 426 failed, 101181 passed, 2091 skipped, 786 xfailed, 312 xpassed in 1126.93s (0:18:46) = --- python/cudf/cudf/io/json.py | 2 + python/cudf/cudf/tests/test_json.py | 95 +++++++++++++++++++---------- python/cudf/cudf/tests/test_s3.py | 4 +- python/cudf/cudf/utils/ioutils.py | 29 +++++++++ 4 files changed, 96 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ae2f0203642..b499fa23ede 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -102,6 +102,8 @@ def read_json( iotypes=(BytesIO, StringIO), allow_raw_text_input=True, storage_options=storage_options, + warn_on_raw_text_input=True, + warn_meta=("json", "read_json"), ) if isinstance(tmp_source, list): filepaths_or_buffers.extend(tmp_source) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index f44b7495aab..5bc9a33fd8d 100644 --- 
a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -13,12 +13,13 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES, assert_eq, + expect_warning_if, ) @@ -95,6 +96,8 @@ def json_files(request, tmp_path_factory, pdf): ) if index is False and orient == "table": pytest.skip("'index=False' isn't valid when 'orient' is 'table'") + if index is True and orient not in ("split", "table", "index", "columns"): + pytest.skip("'index=False' isn't valid when 'orient' is 'table'") fname_df = tmp_path_factory.mktemp("json") / "test_df.json" fname_series = tmp_path_factory.mktemp("json") / "test_series.json" pdf.to_json(fname_df, index=index, compression=compression, orient=orient) @@ -338,8 +341,16 @@ def json_input(request, tmp_path_factory): @pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"]) def test_json_lines_basic(json_input, engine): - cu_df = cudf.read_json(json_input, engine=engine, lines=True) - pd_df = pd.read_json(json_input, lines=True) + with expect_warning_if( + isinstance(json_input, str) and not json_input.endswith(".json") + ): + cu_df = cudf.read_json(json_input, engine=engine, lines=True) + with expect_warning_if( + isinstance(json_input, str) + and PANDAS_GE_210 + and not json_input.endswith(".json") + ): + pd_df = pd.read_json(json_input, lines=True) assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): @@ -353,7 +364,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine): tmp_file1 = tmpdir.join("MultiInputs1.json") tmp_file2 = tmpdir.join("MultiInputs2.json") - pdf = pd.read_json(json_input, lines=True) + with expect_warning_if( + isinstance(json_input, str) + and PANDAS_GE_210 + and not json_input.endswith(".json") + ): + pdf = 
pd.read_json(json_input, lines=True) pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records") pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records") @@ -368,7 +384,12 @@ def test_json_lines_multiple(tmpdir, json_input, engine): @pytest.mark.parametrize("engine", ["auto", "cudf"]) def test_json_read_directory(tmpdir, json_input, engine): - pdf = pd.read_json(json_input, lines=True) + with expect_warning_if( + isinstance(json_input, str) + and PANDAS_GE_210 + and not json_input.endswith(".json") + ): + pdf = pd.read_json(json_input, lines=True) pdf.to_json( tmpdir.join("MultiInputs1.json"), compression="infer", @@ -400,37 +421,47 @@ def test_json_read_directory(tmpdir, json_input, engine): def test_json_lines_byte_range(json_input): # include the first row and half of the second row # should parse the first two rows - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(0, 15) + will_warn = isinstance(json_input, str) and not json_input.endswith( + ".json" ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(0, 15) + ) assert df.shape == (2, 3) # include half of the second row and half of the third row # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 10) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 10) + ) assert df.shape == (1, 3) # include half of the second row and entire third row # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 0) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 0) + ) assert df.shape == (1, 3) # include half of the second row till past the end of the file # should parse only the third row - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, 
byte_range=(10, 50) - ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(10, 50) + ) assert df.shape == (1, 3) def test_json_lines_dtypes(json_input): - df = cudf.read_json( - json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} - ) + with expect_warning_if( + isinstance(json_input, str) and not json_input.endswith(".json") + ): + df = cudf.read_json( + json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} + ) assert all(df.dtypes == ["float64", "int64", "int16"]) @@ -470,32 +501,32 @@ def test_json_engine_selection(): json = "[1, 2, 3]" # should use the cudf engine - df = cudf.read_json(json, lines=True) + df = cudf.read_json(StringIO(json), lines=True) # column names are strings when parsing with cudf for col_name in df.columns: assert isinstance(col_name, str) # should use the pandas engine - df = cudf.read_json(json, lines=False, engine="pandas") + df = cudf.read_json(StringIO(json), lines=False, engine="pandas") # column names are ints when parsing with pandas for col_name in df.columns: assert isinstance(col_name, int) # should use the pandas engine - df = cudf.read_json(json, lines=True, engine="pandas") + df = cudf.read_json(StringIO(json), lines=True, engine="pandas") # column names are ints when parsing with pandas for col_name in df.columns: assert isinstance(col_name, int) # should raise an exception with pytest.raises(ValueError): - cudf.read_json(json, lines=False, engine="cudf_legacy") + cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy") def test_json_bool_values(): buffer = "[true,1]\n[false,false]\n[true,true]" - cu_df = cudf.read_json(buffer, lines=True) - pd_df = pd.read_json(buffer, lines=True) + cu_df = cudf.read_json(StringIO(buffer), lines=True) + pd_df = pd.read_json(StringIO(buffer), lines=True) # types should be ['bool', 'int64'] np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -504,7 +535,7 @@ def test_json_bool_values(): 
np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) cu_df = cudf.read_json( - buffer, lines=True, dtype={"0": "bool", "1": "long"} + StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"} ) np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -522,7 +553,7 @@ def test_json_bool_values(): ], ) def test_json_null_literal(buffer): - df = cudf.read_json(buffer, lines=True, engine="cudf_legacy") + df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy") # first column contains a null field, type should be set to float # second column contains only empty fields, type should be set to int8 @@ -534,7 +565,7 @@ def test_json_null_literal(buffer): def test_json_bad_protocol_string(): - test_string = '{"field": "s3://path"}' + test_string = StringIO('{"field": "s3://path"}') expect = pd.DataFrame([{"field": "s3://path"}]) got = cudf.read_json(test_string, lines=True) @@ -748,7 +779,7 @@ def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine): def test_default_float_bitwidth(default_float_bitwidth): # Test that float columns in json are _inferred_ as 32 bit columns. df = cudf.read_json( - '{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}', + StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'), engine="cudf", lines=True, orient="records", @@ -1231,7 +1262,7 @@ def test_json_round_trip_gzip(): @pytest.mark.parametrize("lines", [True, False]) def test_json_array_of_arrays(data, lines): data = data if lines else "[" + data.replace("\n", ",") + "]" - pdf = pd.read_json(data, orient="values", lines=lines) + pdf = pd.read_json(StringIO(data), orient="values", lines=lines) df = cudf.read_json( StringIO(data), engine="cudf", @@ -1325,8 +1356,8 @@ def _replace_with_nulls(df, replace_items): # both json lines and json string tested. 
json_string = "[" + jsonl_string.replace("\n", ",") + "]" - pdf = pd.read_json(jsonl_string, orient="records", lines=True) - pdf2 = pd.read_json(json_string, orient="records", lines=False) + pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True) + pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False) assert_eq(pdf, pdf2) # replace list elements with None if it has dict and non-dict # in above test cases, these items are mixed with dict/list items diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index b92f84b677c..8db46f87d65 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -3,7 +3,7 @@ import os import socket from contextlib import contextmanager -from io import BytesIO +from io import BytesIO, StringIO import numpy as np import pandas as pd @@ -433,7 +433,7 @@ def test_read_json(s3_base, s3so): storage_options=s3so, ) - expect = pd.read_json(buffer, lines=True) + expect = pd.read_json(StringIO(buffer), lines=True) assert_eq(expect, got) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6641bd8290a..c3b89d64435 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1666,6 +1666,8 @@ def get_reader_filepath_or_buffer( allow_raw_text_input=False, storage_options=None, bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, + warn_on_raw_text_input=None, + warn_meta=None, ): """{docstring}""" @@ -1679,6 +1681,15 @@ def get_reader_filepath_or_buffer( path_or_data, storage_options ) if fs is None: + if warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. 
" + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) return path_or_data, compression if _is_local_filesystem(fs): @@ -1691,6 +1702,24 @@ def get_reader_filepath_or_buffer( raise FileNotFoundError( f"{path_or_data} could not be resolved to any files" ) + elif warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. " + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) + elif warn_on_raw_text_input: + # Do not remove until pandas 3.0 support is added. + warnings.warn( + f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " + "deprecated and will be removed in a future version. " + "To read from a literal string, wrap it in a " + "'StringIO' object.", + FutureWarning, + ) else: if len(paths) == 0: From eea5f107cbb062cc47c935728bb1ae234729de09 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sun, 17 Dec 2023 01:08:23 +0530 Subject: [PATCH 080/162] Preserve column ordering in DataFrame.stack (#14626) This PR preserves original column ordering in DataFrame.stack On pandas_2.0_feature_branch: = 328 failed, 101111 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1113.40s (0:18:33) = This PR: = 316 failed, 101123 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1123.65s (0:18:43) = --- python/cudf/cudf/core/dataframe.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3118dfa4490..50fe5adebf8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6749,11 +6749,11 @@ def stack(self, level=-1, dropna=True): cat 1.0 2.0 dog 3.0 4.0 >>> df_multi_level_cols2.stack() - height weight - cat kg 1.0 - m 2.0 - dog kg 3.0 - m 4.0 + weight height + cat kg 1.0 + m 2.0 + dog kg 3.0 + m 4.0 
**Prescribing the level(s) to be stacked** @@ -6925,10 +6925,18 @@ def unnamed_group_generator(): else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) - unnamed_level_values = unnamed_level_values.unique().sort_values() + unnamed_level_values = unnamed_level_values.unique() data = ColumnAccessor( - dict(zip(unnamed_level_values, stacked)), + dict( + zip( + unnamed_level_values, + [ + stacked[i] + for i in unnamed_level_values.argsort().argsort() + ], + ) + ), isinstance(unnamed_level_values, pd.MultiIndex), unnamed_level_values.names, ) From bc5584b159671f9e92281e22a44d8da9610d8748 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Dec 2023 23:01:19 +0530 Subject: [PATCH 081/162] Change `is_.._dtype` deprecations to `DeprecationWarning` instead of `FutureWarning` (#14617) This PR changes all FutureWarning's to DeprecationWarning's to match with pandas: pandas-dev/pandas#55703 On pandas_2.0_feature_branch: = 501 failed, 101106 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1234.91s (0:20:34) = This PR: = 445 failed, 101162 passed, 2071 skipped, 786 xfailed, 312 xpassed, 20 errors in 1216.79s (0:20:16) = --- .pre-commit-config.yaml | 4 ++++ python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/dtypes.py | 4 ++-- python/cudf/cudf/tests/test_api_types.py | 8 +++++--- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7db8d9ab52f..d14a34ad1a3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,6 +91,10 @@ repos: entry: '(category=|\s)DeprecationWarning[,)]' language: pygrep types_or: [python, cython] + exclude: | + (?x)^( + ^python/cudf/cudf/core/dtypes.py + ) - id: no-programmatic-xfail name: no-programmatic-xfail description: 'Enforce that pytest.xfail is not introduced (see dev docs for details)' diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 
d0b6dcf2e6d..b57fa4e83ed 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -12,4 +12,5 @@ PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") +PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4") PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 834b384d892..c32969a401b 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1043,7 +1043,7 @@ def is_categorical_dtype(obj): warnings.warn( "is_categorical_dtype is deprecated and will be removed in a future " "version. Use isinstance(dtype, cudf.CategoricalDtype) instead", - FutureWarning, + DeprecationWarning, ) return _is_categorical_dtype(obj) @@ -1151,7 +1151,7 @@ def is_interval_dtype(obj): warnings.warn( "is_interval_dtype is deprecated and will be removed in a " "future version. 
Use `isinstance(dtype, cudf.IntervalDtype)` instead", - FutureWarning, + DeprecationWarning, ) return _is_interval_dtype(obj) diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index d640e8e1376..4d617056c10 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -6,7 +6,7 @@ from pandas.api import types as pd_types import cudf -from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_214 from cudf.api import types from cudf.testing._utils import expect_warning_if @@ -1038,9 +1038,11 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): - with expect_warning_if(PANDAS_GE_210): + with expect_warning_if( + PANDAS_GE_210, DeprecationWarning if PANDAS_GE_214 else FutureWarning + ): expected = pd_types.is_categorical_dtype(obj) - with pytest.warns(FutureWarning): + with pytest.warns(DeprecationWarning): actual = types.is_categorical_dtype(obj) assert expected == actual assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) From 194e487edf838054e937091472955daa343dd286 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 19 Dec 2023 05:32:31 +0530 Subject: [PATCH 082/162] Version dataframe.mode pytest (#14650) This PR versions the xfail properly to not fail in version of pandas where this bug is fixed. 
--- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/tests/test_dataframe.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index b57fa4e83ed..c326b19307d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -14,3 +14,4 @@ PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4") PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") +PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e18c1809fd4..f3cd65a72a1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -27,6 +27,7 @@ PANDAS_GE_200, PANDAS_GE_210, PANDAS_LT_140, + PANDAS_LT_203, ) from cudf.api.extensions import no_default from cudf.core.buffer.spill_manager import get_global_manager @@ -8593,6 +8594,7 @@ def test_dataframe_mode(request, df, numeric_only, dropna): request.applymarker( pytest.mark.xfail( condition=PANDAS_GE_200 + and PANDAS_LT_203 and numeric_only is False and "b" in df.columns and df["b"].dtype == np.dtype("timedelta64[s]"), From f736d72c5e1e8d400d9335e39e8ca7c42ef33263 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 19 Dec 2023 07:10:38 +0530 Subject: [PATCH 083/162] Filter ufunc related warnings in pytests (#14652) This PR ignores ufunc runtime warnings that show up in eval API and setitem deprecation warnings. 
On pandas_2.0_feature_branch: = 260 failed, 101179 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1104.58s (0:18:24) = This PR: = 211 failed, 101228 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1095.49s (0:18:15) = --- python/cudf/cudf/tests/test_array_ufunc.py | 67 ++++++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 34 ++++++++++- 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 305f935bcb4..40966f6b6c9 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -10,8 +10,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 -from cudf.testing._utils import assert_eq, set_random_null_mask_inplace +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 +from cudf.testing._utils import ( + assert_eq, + set_random_null_mask_inplace, + expect_warning_if, +) _UFUNCS = [ obj @@ -47,6 +51,21 @@ def _hide_ufunc_warnings(ufunc): category=RuntimeWarning, ) yield + elif name in { + "bitwise_and", + "bitwise_or", + "bitwise_xor", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Operation between non boolean Series with different " + "indexes will no longer return a boolean result in " + "a future version. 
Cast both Series to object type " + "to maintain the prior behavior.", + category=FutureWarning, + ) + yield else: yield @@ -217,7 +236,27 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): assert_eq(g, e, check_exact=False) else: if has_nulls: - expect[mask] = np.nan + with expect_warning_if( + PANDAS_GE_210 + and fname + in ( + "isfinite", + "isinf", + "isnan", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "signbit", + "equal", + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + ) + ): + expect[mask] = np.nan assert_eq(got, expect, check_exact=False) @@ -443,5 +482,25 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): assert_eq(g, e, check_exact=False) else: if has_nulls: - expect[mask] = np.nan + with expect_warning_if( + PANDAS_GE_210 + and fname + in ( + "isfinite", + "isinf", + "isnan", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "signbit", + "equal", + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + ) + ): + expect[mask] = np.nan assert_eq(got, expect, check_exact=False) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f3cd65a72a1..6e9b9a37ac0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9,6 +9,8 @@ import re import string import textwrap +import warnings +from contextlib import contextmanager from collections import OrderedDict, defaultdict, namedtuple from copy import copy @@ -65,6 +67,32 @@ pytest_xfail = pytest.mark.skipif +@contextmanager +def _hide_ufunc_warnings(eval_str): + # pandas raises warnings for some inputs to the following ufuncs: + if any( + x in eval_str + for x in { + "arctanh", + "log", + } + ): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "invalid value encountered in", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + "divide by zero encountered in", + 
category=RuntimeWarning, + ) + yield + else: + yield + + def test_init_via_list_of_tuples(): data = [ (5, "cats", "jump", np.nan), @@ -10071,7 +10099,8 @@ def df_eval(request): ) def test_dataframe_eval(df_eval, expr, dtype): df_eval = df_eval.astype(dtype) - expect = df_eval.to_pandas().eval(expr) + with _hide_ufunc_warnings(expr): + expect = df_eval.to_pandas().eval(expr) got = df_eval.eval(expr) # In the specific case where the evaluated expression is a unary function # of a single column with no nesting, pandas will retain the name. This @@ -10081,7 +10110,8 @@ def test_dataframe_eval(df_eval, expr, dtype): # Test inplace if re.search("[^=><]=[^=]", expr) is not None: pdf_eval = df_eval.to_pandas() - pdf_eval.eval(expr, inplace=True) + with _hide_ufunc_warnings(expr): + pdf_eval.eval(expr, inplace=True) df_eval.eval(expr, inplace=True) assert_eq(pdf_eval, df_eval) From 4539f4f83b4297039c56c87b6fab741994b61334 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 19 Dec 2023 07:11:53 +0530 Subject: [PATCH 084/162] Deprecate positional access for label based indexes in Series.__getitem__ (#14654) This PR deprecates positional access in `Series.__getitem__` when a label-based index is present. 
xref: https://github.com/pandas-dev/pandas/pull/53201 On `pandas_2.0_feature_branch`: ``` = 260 failed, 101179 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1104.58s (0:18:24) = ``` This PR: ``` = 248 failed, 101190 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1105.78s (0:18:25) = ``` --- python/cudf/cudf/core/series.py | 16 ++++++++++++---- python/cudf/cudf/tests/test_csv.py | 4 ++-- python/cudf/cudf/tests/test_indexing.py | 10 +++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 61d7c8d5437..5876a577b87 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -371,6 +371,12 @@ def _loc_to_iloc(self, arg): arg = arg[0] if _is_scalar_or_zero_d_array(arg): index_dtype = self._frame.index.dtype + warn_msg = ( + "Series.__getitem__ treating keys as positions is deprecated. " + "In a future version, integer keys will always be treated " + "as labels (consistent with DataFrame behavior). To access " + "a value by position, use `ser.iloc[pos]`" + ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) and is_integer_dtype(index_dtype.categories.dtype) @@ -379,11 +385,13 @@ def _loc_to_iloc(self, arg): if isinstance(arg, cudf.Scalar) and is_integer_dtype( arg.dtype ): - found_index = arg.value - return found_index + # Do not remove until pandas 3.0 support is added. + warnings.warn(warn_msg, FutureWarning) + return arg.value elif is_integer(arg): - found_index = arg - return found_index + # Do not remove until pandas 3.0 support is added. 
+ warnings.warn(warn_msg, FutureWarning) + return arg try: indices = self._frame.index._indices_of(arg) if (n := len(indices)) == 0: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 1ccf91fe63e..cbb262cd649 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -595,12 +595,12 @@ def test_csv_reader_NaN_values(): header=None, na_values=custom_na_values, ) - assert gdf.dtypes[0] == "int8" + assert gdf.dtypes.iloc[0] == "int8" assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) # data type detection should evaluate the column to object if some nulls gdf = read_csv(StringIO(all_cells), header=None) - assert gdf.dtypes[0] == np.dtype("object") + assert gdf.dtypes.iloc[0] == np.dtype("object") def test_csv_reader_thousands(tmpdir): diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 27e84f179b6..e921a6ccf3f 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -9,11 +9,13 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_210 from cudf.testing import _utils as utils from cudf.testing._utils import ( INTEGER_TYPES, assert_eq, assert_exceptions_equal, + expect_warning_if, ) index_dtypes = INTEGER_TYPES @@ -151,8 +153,10 @@ def test_series_get_item_iloc_defer(arg): ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) - expect = ps[arg] - got = gs[arg] + with expect_warning_if(PANDAS_GE_210 and not isinstance(arg, str)): + expect = ps[arg] + with expect_warning_if(not isinstance(arg, str)): + got = gs[arg] assert_eq(expect, got) @@ -163,7 +167,7 @@ def test_series_iloc_defer_cudf_scalar(): for t in index_dtypes: arg = cudf.Scalar(1, dtype=t) - got = gs[arg] + got = gs.iloc[arg] expect = 2 assert_eq(expect, got) From c1411b6c1df40d98d1c3172175f73fc56e1b7a82 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Dec 2023 00:17:57 
+0530 Subject: [PATCH 085/162] Deprecate `method` in `interpolate` and calculation on `object` dtype (#14667) This PR: - [x] Deprecates `method` in `interpolate`. - [x] Deprecates performing `interpolate` on string columns. On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 187 failed, 101252 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1090.48s (0:18:10) = ``` --- python/cudf/cudf/core/indexed_frame.py | 16 +++++++++++- python/cudf/cudf/tests/test_interpolate.py | 29 ++++++++++++++-------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b4fba1eef07..d2223ff004a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -40,8 +40,8 @@ ) from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_categorical_dtype, + _is_non_decimal_numeric_dtype, is_bool_dtype, is_decimal_dtype, is_dict_like, @@ -1067,6 +1067,14 @@ def interpolate( f"`limit_direction` must be 'backward' for method `{method}`" ) + if method.lower() in {"ffill", "bfill", "pad", "backfill"}: + warnings.warn( + f"{type(self).__name__}.interpolate with method={method} is " + "deprecated and will raise in a future version. 
" + "Use obj.ffill() or obj.bfill() instead.", + FutureWarning, + ) + data = self if not isinstance(data._index, cudf.RangeIndex): @@ -1082,6 +1090,12 @@ def interpolate( interpolator = cudf.core.algorithms.get_column_interpolator(method) columns = {} for colname, col in data._data.items(): + if isinstance(col, cudf.core.column.StringColumn): + warnings.warn( + f"{type(self).__name__}.interpolate with object dtype is " + "deprecated and will raise in a future version.", + FutureWarning, + ) if col.nullable: col = col.astype("float64").fillna(np.nan) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index c0b085a5097..3acda9165fd 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -1,9 +1,14 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import pytest import cudf -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.core._compat import PANDAS_GE_210 +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) @pytest.mark.parametrize( @@ -49,8 +54,10 @@ def test_interpolate_series(data, method, axis): gsr = cudf.Series(data) psr = gsr.to_pandas() - expect = psr.interpolate(method=method, axis=axis) - got = gsr.interpolate(method=method, axis=axis) + with expect_warning_if(PANDAS_GE_210 and psr.dtype == "object"): + expect = psr.interpolate(method=method, axis=axis) + with expect_warning_if(gsr.dtype == "object"): + got = gsr.interpolate(method=method, axis=axis) assert_eq(expect, got, check_dtype=psr.dtype != "object") @@ -87,8 +94,10 @@ def test_interpolate_series_values_or_index(data, index, method): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() - expect = psr.interpolate(method=method) - got = gsr.interpolate(method=method) + with expect_warning_if(PANDAS_GE_210 and gsr.dtype == "object"): + expect = psr.interpolate(method=method) + 
with expect_warning_if(gsr.dtype == "object"): + got = gsr.interpolate(method=method) assert_eq(expect, got, check_dtype=psr.dtype != "object") @@ -100,12 +109,12 @@ def test_interpolate_series_values_or_index(data, index, method): {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, {"axis": 0, "method": "linear"}, ), - ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}), - ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}), - ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "backward"}), ( {"A": [1, 2, 3]}, - {"method": "backfill", "limit_direction": "forward"}, + {"method": "backfill", "limit_direction": "backward"}, ), ], ) From 2b9ab53599511b636ed5067ff7d18c617c1172b5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Dec 2023 00:19:56 +0530 Subject: [PATCH 086/162] Add more validation to MultiIndex.to_frame (#14671) This PR adds validation to `name` inputs in `MultiIndex.to_frame` API. 
On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 180 failed, 101247 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1025.07s (0:17:05) = ``` --- python/cudf/cudf/core/multiindex.py | 4 ++++ python/cudf/cudf/tests/test_multiindex.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5c2b4e6c7b0..a2cc5450ca4 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1028,6 +1028,10 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): for level, name in enumerate(self.names) ] else: + if not is_list_like(name): + raise TypeError( + "'name' must be a list / sequence of column names." + ) if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 5fdeacc346f..0cdc0e42cc1 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1953,13 +1953,13 @@ def test_multiindex_to_frame_allow_duplicates( ): gidx = cudf.from_pandas(pidx) - if ( + if name is None or ( ( len(pidx.names) != len(set(pidx.names)) and not all(x is None for x in pidx.names) ) and not allow_duplicates - and (name is None or name is no_default) + and name is no_default ): assert_exceptions_equal( pidx.to_frame, From 46ef14838e6aa97e1400ae704bd47e9b97c86324 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Dec 2023 00:20:40 +0530 Subject: [PATCH 087/162] Deprecate ignoring empty objects in concat (#14672) This PR deprecates ignoring `empty` objects for dtype calculation in `concat`. 
On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 179 failed, 101260 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1225.23s (0:20:25) = ``` ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. --- python/cudf/cudf/core/dataframe.py | 36 ++-- python/cudf/cudf/core/groupby/groupby.py | 12 +- python/cudf/cudf/core/index.py | 14 +- python/cudf/cudf/core/join/_join_helpers.py | 9 +- python/cudf/cudf/core/multiindex.py | 21 ++- python/cudf/cudf/core/reshape.py | 14 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/io/parquet.py | 16 +- python/cudf/cudf/tests/test_concat.py | 187 ++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 72 +++++--- python/cudf/cudf/tests/test_index.py | 22 ++- 11 files changed, 263 insertions(+), 144 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 50fe5adebf8..bfb5fbe4d48 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -910,7 +910,9 @@ def _init_from_series_list(self, data, columns, index): transpose = self.T else: - concat_df = cudf.concat(data, axis=1) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + concat_df = cudf.concat(data, axis=1) cols = concat_df._data.to_pandas_index() if cols.dtype == "object": @@ -1920,9 +1922,11 @@ def _get_renderable_dataframe(self): lower_left = self.tail(lower_rows).iloc[:, :left_cols] lower_right = self.tail(lower_rows).iloc[:, right_cols:] - upper = cudf.concat([upper_left, upper_right], axis=1) - lower = cudf.concat([lower_left, lower_right], axis=1) - output = cudf.concat([upper, lower]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + 
upper = cudf.concat([upper_left, upper_right], axis=1) + lower = cudf.concat([lower_left, lower_right], axis=1) + output = cudf.concat([upper, lower]) output = self._clean_nulls_from_dataframe(output) output._index = output._index._clean_nulls_from_index() @@ -5154,14 +5158,17 @@ def describe( None, ) - return cudf.concat( - [ - series.reindex(names, copy=False) - for series in describe_series_list - ], - axis=1, - sort=False, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + res = cudf.concat( + [ + series.reindex(names, copy=False) + for series in describe_series_list + ], + axis=1, + sort=False, + ) + return res @_cudf_nvtx_annotate def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: @@ -6258,7 +6265,10 @@ def mode(self, axis=0, numeric_only=False, dropna=True): if len(mode_results) == 0: return DataFrame() - df = cudf.concat(mode_results, axis=1) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + df = cudf.concat(mode_results, axis=1) + if isinstance(df, Series): df = df.to_frame() diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0262e586807..849ec46f74d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1319,13 +1319,17 @@ def _post_process_chunk_results( # group is a row-like "Series" where the index labels # are the same as the original calling DataFrame if _is_row_of(chunk_results[0], self.obj): - result = cudf.concat(chunk_results, axis=1).T + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results, axis=1).T result.index = group_names result.index.names = self.grouping.names # When the UDF is like df.x + df.y, the result for each # group is the same length as the original group elif len(self.obj) == sum(len(chk) for chk in chunk_results): - result = cudf.concat(chunk_results) + with 
warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results) index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column result.index = cudf.MultiIndex._from_data(index_data) @@ -1336,7 +1340,9 @@ def _post_process_chunk_results( f"type {type(chunk_results[0])}" ) else: - result = cudf.concat(chunk_results) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + result = cudf.concat(chunk_results) if self._group_keys: index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3cce1ab515e..25a58d77830 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1103,6 +1103,16 @@ def _values(self): @_cudf_nvtx_annotate def _concat(cls, objs): non_empties = [index for index in objs if len(index)] + if len(objs) != len(non_empties): + # Do not remove until pandas-3.0 support is added. + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty items when determining the result dtype. 
" + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + ) if all(isinstance(obj, RangeIndex) for obj in non_empties): result = _concat_range_index(non_empties) else: @@ -1300,7 +1310,9 @@ def __repr__(self): top = self[0:mr] bottom = self[-1 * mr :] - preprocess = cudf.concat([top, bottom]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + preprocess = cudf.concat([top, bottom]) else: preprocess = self diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 1071261044f..822c1848d58 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,6 +2,7 @@ from __future__ import annotations +import warnings from collections import abc from typing import TYPE_CHECKING, Any, Tuple, cast @@ -170,9 +171,11 @@ def _match_categorical_dtypes_both( return lcol, rcol.astype(ltype) else: # merge categories - merged_categories = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + merged_categories = cudf.concat( + [ltype.categories, rtype.categories] + ).unique() common_type = cudf.CategoricalDtype( categories=merged_categories, ordered=False ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a2cc5450ca4..1bca738590f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,6 +6,7 @@ import numbers import operator import pickle +import warnings from collections import abc from functools import cached_property from numbers import Integral @@ -717,15 +718,17 @@ def _compute_validity_mask(self, index, row_tuple, max_length): continue lookup[i] = cudf.Series(row) frame = cudf.DataFrame(dict(enumerate(index._data.columns))) - data_table = cudf.concat( - [ - frame, - cudf.DataFrame( - {"idx": 
cudf.Series(column.arange(len(frame)))} - ), - ], - axis=1, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + data_table = cudf.concat( + [ + frame, + cudf.DataFrame( + {"idx": cudf.Series(column.arange(len(frame)))} + ), + ], + axis=1, + ) # Sort indices in pandas compatible mode # because we want the indices to be fetched # in a deterministic order. diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 7a80d70acb3..465186d81d2 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -5,6 +5,7 @@ from typing import Dict, Optional import cupy +import warnings import numpy as np import pandas as pd @@ -320,9 +321,20 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): df = cudf.DataFrame() _normalize_series_and_dataframe(objs, axis=axis) + any_empty = any(obj.empty for obj in objs) + if any_empty: + # Do not remove until pandas-3.0 support is added. + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty items when determining the result dtype. " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + ) # Inner joins involving empty data frames always return empty dfs, but # We must delay returning until we have set the column names. 
- empty_inner = any(obj.empty for obj in objs) and join == "inner" + empty_inner = any_empty and join == "inner" objs = [obj for obj in objs if obj.shape != (0, 0)] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5876a577b87..959b91afd32 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1429,7 +1429,9 @@ def __repr__(self): if max_rows not in (0, None) and len(self) > max_rows: top = self.head(int(max_rows / 2 + 1)) bottom = self.tail(int(max_rows / 2 + 1)) - preprocess = cudf.concat([top, bottom]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + preprocess = cudf.concat([top, bottom]) else: preprocess = self.copy() preprocess.index = preprocess.index._clean_nulls_from_index() diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bcc24a85cf9..a6da55c1a7f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -794,13 +794,15 @@ def _parquet_to_frame( dtype=_dtype, ) - # Concatenate dfs and return. - # Assume we can ignore the index if it has no name. - return ( - cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) - if len(dfs) > 1 - else dfs[0] - ) + if len(dfs) > 1: + # Concatenate dfs and return. + # Assume we can ignore the index if it has no name. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + res = cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + return res + else: + return dfs[0] @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index a265618e4ba..7fa1b634185 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -2,10 +2,13 @@ from decimal import Decimal +import warnings import numpy as np import pandas as pd import pytest +from contextlib import contextmanager + import cudf as gd from cudf.api.types import _is_categorical_dtype from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 @@ -17,6 +20,20 @@ ) +@contextmanager +def _hide_concat_empty_dtype_warning(): + with warnings.catch_warnings(): + # Ignoring warnings in this test as warnings are + # being caught and validated in other tests. + warnings.filterwarnings( + "ignore", + "The behavior of array concatenation with empty entries " + "is deprecated.", + category=FutureWarning, + ) + yield + + def make_frames(index=None, nulls="none"): df = pd.DataFrame( { @@ -66,8 +83,9 @@ def test_concat_dataframe(index, nulls, axis): df_empty1 = gdf_empty1.to_pandas() # DataFrame - res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() - sol = pd.concat([df, df2, df, df_empty1], axis=axis) + with _hide_concat_empty_dtype_warning(): + res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() + sol = pd.concat([df, df2, df, df_empty1], axis=axis) assert_eq( res, sol, @@ -476,8 +494,9 @@ def test_concat_series_dataframe_input(objs): pd_objs = objs gd_objs = [gd.from_pandas(obj) for obj in objs] - expected = pd.concat(pd_objs) - actual = gd.concat(gd_objs) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat(pd_objs) + actual = gd.concat(gd_objs) assert_eq( expected.fillna(-1), @@ -843,23 +862,24 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, 
join, axis): gdf3 = gd.from_pandas(pdf3) gdf_empty1 = gd.from_pandas(pdf_empty1) - assert_eq( - pd.concat( - [pdf1, pdf2, pdf3, pdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - gd.concat( - [gdf1, gdf2, gdf3, gdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=False, - ) + with _hide_concat_empty_dtype_warning(): + assert_eq( + pd.concat( + [pdf1, pdf2, pdf3, pdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + gd.concat( + [gdf1, gdf2, gdf3, gdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + check_index_type=False, + ) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -970,20 +990,21 @@ def test_concat_join_no_overlapping_columns_many_and_empty( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - expected = pd.concat( - [pdf4, pdf5, pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - [gdf4, gdf5, gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf4, pdf5, pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf4, gdf5, gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) assert_eq( expected, actual, @@ -1042,20 +1063,21 @@ def test_concat_join_no_overlapping_columns_many_and_empty2( ): objs_gd = [gd.from_pandas(o) if o is not None else o for o in objs] - expected = pd.concat( - objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - objs_gd, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual 
= gd.concat( + objs_gd, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) assert_eq(expected, actual, check_index_type=False) @@ -1079,20 +1101,21 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - expected = pd.concat( - [pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = gd.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) assert_eq( expected, actual, @@ -1109,7 +1132,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): s1 = gd.Series(["a", "b", "c"]) s2 = gd.Series(["a", "b"]) s3 = gd.Series(["a", "b", "c", "d"]) - s4 = gd.Series() + s4 = gd.Series(dtype="str") ps1 = s1.to_pandas() ps2 = s2.to_pandas() @@ -1123,13 +1146,14 @@ def test_concat_join_series(ignore_index, sort, join, axis): ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) + with expect_warning_if(axis == 1): + actual = gd.concat( + [s1, s2, s3, s4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) if PANDAS_GE_150: assert_eq( @@ -1327,12 +1351,21 @@ def test_concat_join_empty_dataframes_axis_1( gdf = gd.from_pandas(df) other_gd = [gdf] + [gd.from_pandas(o) for o in other] - expected = pd.concat( - other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - actual = gd.concat( - other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + other_pd, + ignore_index=ignore_index, + axis=axis, + 
join=join, + sort=sort, + ) + actual = gd.concat( + other_gd, + ignore_index=ignore_index, + axis=axis, + join=join, + sort=sort, + ) if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6e9b9a37ac0..94aff555c7f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -93,6 +93,20 @@ def _hide_ufunc_warnings(eval_str): yield +@contextmanager +def _hide_concat_empty_dtype_warning(): + with warnings.catch_warnings(): + # Ignoring warnings in this test as warnings are + # being caught and validated in other tests. + warnings.filterwarnings( + "ignore", + "The behavior of array concatenation with empty " + "entries is deprecated.", + category=FutureWarning, + ) + yield + + def test_init_via_list_of_tuples(): data = [ (5, "cats", "jump", np.nan), @@ -1601,8 +1615,9 @@ def test_dataframe_concat_different_column_types(): "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] ) def test_concat_empty_dataframe(df_1, df_2): - got = cudf.concat([df_1, df_2]) - expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([df_1, df_2]) + expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) # ignoring dtypes as pandas upcasts int to float # on concatenation with empty dataframes @@ -1628,10 +1643,15 @@ def test_concat_empty_dataframe(df_1, df_2): ], ) def test_concat_different_column_dataframe(df1_d, df2_d): - got = cudf.concat( - [cudf.DataFrame(df1_d), cudf.DataFrame(df2_d), cudf.DataFrame(df1_d)], - sort=False, - ) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat( + [ + cudf.DataFrame(df1_d), + cudf.DataFrame(df2_d), + cudf.DataFrame(df1_d), + ], + sort=False, + ) pdf1 = pd.DataFrame(df1_d) pdf2 = pd.DataFrame(df2_d) @@ -1670,8 +1690,9 @@ def is_invalid_concat(left, right): ) 
@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) def test_concat_empty_series(ser_1, ser_2): - got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) - expect = pd.concat([ser_1, ser_2]) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) + expect = pd.concat([ser_1, ser_2]) assert_eq(got, expect, check_index_type=True) @@ -7500,8 +7521,13 @@ def test_dataframe_concat_dataframe(df, other, sort, ignore_index): gdf = cudf.from_pandas(df) other_gd = cudf.from_pandas(other) - expected = pd.concat([pdf, other_pd], sort=sort, ignore_index=ignore_index) - actual = cudf.concat([gdf, other_gd], sort=sort, ignore_index=ignore_index) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf, other_pd], sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf, other_gd], sort=sort, ignore_index=ignore_index + ) # In empty dataframe cases, Pandas & cudf differ in columns # creation, pandas creates RangeIndex(0, 0) @@ -7739,12 +7765,13 @@ def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): gdf = cudf.from_pandas(df) other_gd = [cudf.from_pandas(o) for o in other] - expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) # In some cases, Pandas creates an empty Index([], dtype="object") for # columns whereas cudf creates a RangeIndex(0, 0). 
@@ -7854,12 +7881,13 @@ def test_dataframe_concat_lists(df, other, sort, ignore_index): gdf = cudf.from_pandas(df) other_gd = [cudf.from_pandas(o) for o in other_pd] - expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index - ) + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf] + other_pd, sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf] + other_gd, sort=sort, ignore_index=ignore_index + ) if expected.shape != df.shape: assert_eq( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 445fc84981b..d06041301b9 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1034,16 +1034,19 @@ def test_index_append(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = cudf.Index(data) + gd_other = cudf.Index(other) if cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): gd_data = gd_data.astype("str") gd_other = gd_other.astype("str") - expected = pd_data.append(pd_other) - - actual = gd_data.append(gd_other) + with expect_warning_if( + (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype + ): + expected = pd_data.append(pd_other) + with expect_warning_if(len(data) == 0 or len(other) == 0): + actual = gd_data.append(gd_other) if len(data) == 0 and len(other) == 0: # Pandas default dtype to "object" for empty list # cudf default dtype to "float" for empty list @@ -1233,8 +1236,13 @@ def test_index_append_list(data, other): gd_data = cudf.from_pandas(data) gd_other = [cudf.from_pandas(i) for i in other] - expected = pd_data.append(pd_other) - actual = gd_data.append(gd_other) + with expect_warning_if( + (len(data) == 0 or any(len(d) == 0 for d in other)) + and (any(d.dtype != data.dtype for d in other)) 
+ ): + expected = pd_data.append(pd_other) + with expect_warning_if(len(data) == 0 or any(len(d) == 0 for d in other)): + actual = gd_data.append(gd_other) assert_eq(expected, actual) From e218f5c384b64a93597273265435f19d087206ee Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Dec 2023 00:56:56 +0530 Subject: [PATCH 088/162] Deprecate setting of incompatible dtypes to an existing column (#14668) This PR deprecates the setting of a value that is not of same dtype as that of a column. On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 176 failed, 101263 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1096.08s (0:18:16) = ``` --- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/series.py | 9 +++++++ python/cudf/cudf/tests/test_indexing.py | 34 ++++++++++++++++++++----- python/cudf/cudf/tests/test_setitem.py | 14 +++++++--- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 25a58d77830..0b0b25281ce 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3016,7 +3016,7 @@ def _get_indexer_basic(index, positions, method, target_col, tolerance): # sentinel for missing values else: # Mark indices to the right of the largest value as not found - positions[positions == len(index)] = -1 + positions[positions == len(index)] = np.int32(-1) if tolerance is not None: distance = abs(index[positions] - target_col) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 959b91afd32..6080a37f0a2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -284,6 +284,15 @@ def __setitem__(self, key, value): to_dtype = np.result_type(value.dtype, self._frame._column.dtype) value = value.astype(to_dtype) if to_dtype != self._frame._column.dtype: + # Do not remove until pandas-3.0 support is added. 
+ warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. " + f"Value '{value}' has dtype incompatible with " + f"{self._frame._column.dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + ) self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e921a6ccf3f..f2b58a80362 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -930,8 +930,17 @@ def test_series_setitem_basics(key, value, nulls): elif nulls == "all": psr[:] = None gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value + with expect_warning_if( + PANDAS_GE_210 + and isinstance(value, list) + and len(value) == 0 + and nulls == "none" + ): + psr[key] = value + with expect_warning_if( + isinstance(value, list) and len(value) == 0 and nulls == "none" + ): + gsr[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -974,8 +983,17 @@ def test_series_setitem_iloc(key, value, nulls): elif nulls == "all": psr[:] = None gsr = cudf.from_pandas(psr) - psr.iloc[key] = value - gsr.iloc[key] = value + with expect_warning_if( + PANDAS_GE_210 + and isinstance(value, list) + and len(value) == 0 + and nulls == "none" + ): + psr.iloc[key] = value + with expect_warning_if( + isinstance(value, list) and len(value) == 0 and nulls == "none" + ): + gsr.iloc[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -994,8 +1012,12 @@ def test_series_setitem_iloc(key, value, nulls): def test_series_setitem_dtype(key, value): psr = pd.Series([1, 2, 3], dtype="int32") gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value + + with expect_warning_if(isinstance(value, (float, list))): + psr[key] = value + with expect_warning_if(isinstance(value, (float, list))): + gsr[key] = value + assert_eq(psr, gsr) diff --git 
a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 6e1e53fc869..2d663a6c329 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,8 +5,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_210 +from cudf.testing._utils import ( + assert_eq, + assert_exceptions_equal, + expect_warning_if, +) @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) @@ -310,8 +314,10 @@ def test_series_setitem_upcasting(dtype, indices): # column dtype. new_value = np.float64(np.pi) col_ref = cr._column - sr[indices] = new_value - cr[indices] = new_value + with expect_warning_if(PANDAS_GE_210 and dtype != np.float64): + sr[indices] = new_value + with expect_warning_if(dtype != np.float64): + cr[indices] = new_value if PANDAS_GE_150: assert_eq(sr, cr) else: From fd1f98641fe5c705e24ea645d12b27cd8ee4cea2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Dec 2023 01:05:34 +0530 Subject: [PATCH 089/162] Fix datetime related assertions and warnings in pytests (#14673) This PR fixes all `datetime` related pytests by properly handling their assertions with bug-fixes made in pandas-2.x and filtering newly introduced warnings where not necessary to propagate to the end-user. 
On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 161 failed, 101280 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1106.29s (0:18:26) = ``` --- python/cudf/cudf/core/column/column.py | 8 ++- python/cudf/cudf/core/tools/datetimes.py | 12 +++-- python/cudf/cudf/tests/test_datetime.py | 65 +++++++++--------------- python/cudf/cudf/tests/test_joining.py | 8 +-- python/cudf/cudf/tests/test_orc.py | 29 +++-------- python/cudf/cudf/tests/test_parquet.py | 7 --- 6 files changed, 45 insertions(+), 84 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b79d0644696..e83d82307e5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,6 +4,8 @@ import builtins import pickle +import warnings + from collections import abc from functools import cached_property from itertools import chain @@ -2596,7 +2598,11 @@ def _construct_array( ): # We may have date-like strings with timezones try: - pd_arbitrary = pd.to_datetime(arbitrary) + with warnings.catch_warnings(): + # Need to ignore userwarnings when + # datetime format cannot be inferred. + warnings.simplefilter("ignore", UserWarning) + pd_arbitrary = pd.to_datetime(arbitrary) if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 3d06e82d4cb..1525dd1da22 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -923,10 +923,14 @@ def date_range( # FIXME: when `end_estim` is out of bound, but the actual `end` is not, # we shouldn't raise but compute the sequence as is. The trailing overflow # part should get trimmed at the end. 
- end_estim = ( - pd.Timestamp(start.value) - + periods * offset._maybe_as_fast_pandas_offset() - ).to_datetime64() + with warnings.catch_warnings(): + # Need to ignore userwarnings where nonzero nanoseconds + # are dropped in conversion during the binops + warnings.simplefilter("ignore", UserWarning) + end_estim = ( + pd.Timestamp(start.value) + + periods * offset._maybe_as_fast_pandas_offset() + ).to_datetime64() if "months" in offset.kwds or "years" in offset.kwds: # If `offset` is non-fixed frequency, resort to libcudf. diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 2368b3e539c..88a50c7936e 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -642,21 +642,10 @@ def test_cudf_to_datetime(data, dayfirst): expected = pd.to_datetime(pd_data, dayfirst=dayfirst) actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) - # TODO: Remove typecast to `ns` and following if/else - # workaround after following issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - - if actual is not None and expected is not None: - assert_eq( - actual.astype(pd_data.dtype) - if pd_data is not None - and hasattr(pd_data, "dtype") - and cudf.api.types.is_datetime_dtype(pd_data.dtype) - else actual.astype("datetime64[ns]"), - expected, - ) + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) else: - assert_eq(actual, expected) + assert_eq(actual, expected, check_exact=False) @pytest.mark.parametrize( @@ -748,11 +737,10 @@ def test_to_datetime_units(data, unit): expected = pd.to_datetime(pd_data, unit=unit) actual = cudf.to_datetime(gd_data, unit=unit) - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - - assert_eq(actual.astype("datetime64[ns]"), expected) + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, 
expected, exact=False, check_exact=False) @pytest.mark.parametrize( @@ -810,11 +798,11 @@ def test_to_datetime_format(data, format, infer_datetime_format): actual = cudf.to_datetime( gd_data, format=format, infer_datetime_format=infer_datetime_format ) - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - assert_eq(actual.astype("datetime64[ns]"), expected) + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, check_exact=False) def test_to_datetime_data_out_of_range_for_format(): @@ -879,11 +867,8 @@ def test_datetime_scalar_timeunit_cast(timeunit): gs = Series(testscalar) ps = pd.Series(testscalar) - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - assert_eq(ps, gs.astype("datetime64[ns]")) + assert_eq(ps, gs, check_dtype=False) gdf = DataFrame() gdf["a"] = np.arange(5) @@ -894,11 +879,7 @@ def test_datetime_scalar_timeunit_cast(timeunit): pdf["b"] = testscalar assert gdf["b"].dtype == cudf.dtype("datetime64[s]") - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - gdf["b"] = gdf["b"].astype("datetime64[ns]") - assert_eq(pdf, gdf) + assert_eq(pdf, gdf, check_dtype=True) @pytest.mark.parametrize( @@ -1328,14 +1309,13 @@ def test_datetime_infer_format(data, timezone, dtype): assert_eq(expected, actual) else: - with pytest.raises(NotImplementedError): - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=sr.astype, - lfunc_args_and_kwargs=([], {"dtype": dtype}), - rfunc_args_and_kwargs=([], {"dtype": dtype}), - check_exception_type=False, - ) + assert_exceptions_equal( + lfunc=psr.astype, + rfunc=sr.astype, + lfunc_args_and_kwargs=([], {"dtype": dtype}), + rfunc_args_and_kwargs=([], {"dtype": dtype}), + check_exception_type=False, + ) def 
test_dateoffset_instance_subclass_check(): @@ -1634,7 +1614,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods): request.applymarker( pytest.mark.xfail( condition=( - "nanoseconds" in freq + not PANDAS_GE_210 + and "nanoseconds" in freq and periods != 1 and end == "1970-01-01 00:00:00" ), @@ -2268,7 +2249,7 @@ def test_format_timezone_not_implemented(code): @pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"]) def test_no_format_timezone_not_implemented(tz): - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, ValueError)): cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0544406924f..b273e554158 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -787,13 +787,7 @@ def test_join_datetimes_index(dtype): assert gdf["d"].dtype == cudf.dtype(dtype) - if PANDAS_GE_200: - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - gdf = gdf.astype("datetime64[ns]") - - assert_join_results_equal(pdf, gdf, how="inner") + assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False) def test_join_with_different_names(): diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index e3f1c8eec4d..e53fa1fb4bf 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -21,7 +21,6 @@ gen_rand_series, supported_numpy_dtypes, ) -from cudf.core._compat import PANDAS_GE_200 # Removal of these deprecated features is no longer imminent. They will not be # removed until a suitable alternative has been implemented. 
As a result, we @@ -160,13 +159,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): pdf = orcfile.read().to_pandas(date_as_object=False) gdf = cudf.read_orc(path, use_index=use_index) - if PANDAS_GE_200: - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - gdf = gdf.astype("datetime64[ns]") - - assert_eq(pdf, gdf, check_categorical=False) + assert_eq(pdf, gdf, check_categorical=False, check_exact=False) def test_orc_reader_strings(datadir): @@ -1832,13 +1825,7 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): with expect_warning_if(engine == "pyarrow", UserWarning): got = cudf.read_orc(buffer, engine=engine) - if PANDAS_GE_200: - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]") - - assert_eq(negative_timestamp_df, got) + assert_eq(negative_timestamp_df, got, check_dtype=False) def test_orc_writer_negative_timestamp(negative_timestamp_df): @@ -1847,14 +1834,10 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): buffer = BytesIO() negative_timestamp_df.to_orc(buffer) - if PANDAS_GE_200: - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - negative_timestamp_df = negative_timestamp_df.astype("datetime64[ns]") - - assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False) + assert_eq( + negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False + ) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index adadf147503..971bfe74185 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ 
b/python/cudf/cudf/tests/test_parquet.py @@ -632,13 +632,6 @@ def test_parquet_reader_microsecond_timestamps(datadir): expect = pd.read_parquet(fname) got = cudf.read_parquet(fname) - if PANDAS_GE_200: - # TODO: Remove typecast to `ns` after following - # issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52449 - assert got["a"].dtype == cudf.dtype("datetime64[us]") - got = got.astype("datetime64[ns]") - assert_eq(expect, got) From cb09a3911ff75f4a9557912a0a426827b52e2ed3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 29 Dec 2023 06:13:09 +0530 Subject: [PATCH 090/162] Fix pytest condition to include more warning scenarios (#14680) This PR fixes the calculation of the `cond` variable in `test_corr1d` so that it covers more warning cases. This change fixes 9 pytest failures. --- python/cudf/cudf/tests/test_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 41fac49ea83..f24a5ea7b41 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -480,7 +480,7 @@ def test_corr1d(data1, data2, method): # Spearman allows for size 1 samples, but will error if all data in a # sample is identical since the covariance is zero and so the correlation # coefficient is not defined. 
- cond = (is_singular and method == "pearson") or ( + cond = ((is_singular or is_identical) and method == "pearson") or ( is_identical and not is_singular and method == "spearman" ) if method == "spearman": From 1c54354bcdde1c2728213ff3fac8e5be0e613242 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 29 Dec 2023 14:38:41 +0530 Subject: [PATCH 091/162] Sort `Index.difference` & `union` results for early exit scenarios (#14681) This PR sorts results in `Index.difference` & `union` in the early exit scenarios similar to: https://github.com/pandas-dev/pandas/pull/51346/ On `pandas_2.0_feature_branch`: ``` = 110 failed, 101331 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1064.30s (0:17:44) = ``` This PR: ``` = 87 failed, 101354 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1004.34s (0:16:44) = ``` --- python/cudf/cudf/core/_base_index.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 0a70f3050eb..82d496a5c78 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -629,12 +629,18 @@ def union(self, other, sort=None): common_dtype = cudf.utils.dtypes.find_common_type( [self.dtype, other.dtype] ) - return self._get_reconciled_name_object(other).astype(common_dtype) + res = self._get_reconciled_name_object(other).astype(common_dtype) + if sort: + return res.sort_values() + return res elif not len(self): common_dtype = cudf.utils.dtypes.find_common_type( [self.dtype, other.dtype] ) - return other._get_reconciled_name_object(self).astype(common_dtype) + res = other._get_reconciled_name_object(self).astype(common_dtype) + if sort: + return res.sort_values() + return res result = self._union(other, sort=sort) result.name = _get_result_name(self.name, other.name) @@ -1091,9 +1097,15 @@ def difference(self, other, sort=None): other = cudf.Index(other, name=getattr(other, "name", self.name)) if not 
len(other): - return self._get_reconciled_name_object(other) + res = self._get_reconciled_name_object(other) + if sort: + return res.sort_values() + return res elif self.equals(other): - return self[:0]._get_reconciled_name_object(other) + res = self[:0]._get_reconciled_name_object(other) + if sort: + return res.sort_values() + return res res_name = _get_result_name(self.name, other.name) From 8a8b627076b42f44aab2eaac9f166a3e3e6ff2fc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 30 Dec 2023 14:51:27 +0530 Subject: [PATCH 092/162] Fix column parameter handling in `read_orc` (#14666) When `columns=[]` for `read_orc`, pandas actually only drops the column and preserves the `index` while reading an orc file. Fixing the `cudf` behavior to match the same. On `pandas_2.0_feature_branch`: ``` = 198 failed, 101241 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1098.81s (0:18:18) = ``` This PR: ``` = 185 failed, 101254 passed, 2091 skipped, 954 xfailed, 312 xpassed in 1088.47s (0:18:08) = ``` --- python/cudf/cudf/_lib/orc.pyx | 11 +++++++++-- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/tests/test_orc.py | 13 ++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 0ae039b14d2..62a9a2886b6 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -116,6 +116,7 @@ cpdef read_orc(object filepaths_or_buffers, ) cdef table_with_metadata c_result + cdef size_type nrows with nogil: c_result = move(libcudf_read_orc(c_orc_reader_options)) @@ -127,6 +128,12 @@ cpdef read_orc(object filepaths_or_buffers, skip_rows, num_rows) + if columns is not None and (isinstance(columns, list) and len(columns) == 0): + # When `columns=[]`, index needs to be + # established, but not the columns. 
+ nrows = c_result.tbl.get()[0].view().num_rows() + return {}, cudf.RangeIndex(nrows) + data, index = data_from_unique_ptr( move(c_result.tbl), col_names if columns is None else names, @@ -173,7 +180,6 @@ cdef tuple _get_index_from_metadata( range_idx = None if json_str != "": meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: index_col = meta['index_columns'] if isinstance(index_col[0], dict) and \ @@ -353,7 +359,8 @@ cdef orc_reader_options make_orc_reader_options( c_column_names.reserve(len(column_names)) for col in column_names: c_column_names.push_back(str(col).encode()) - opts.set_columns(c_column_names) + if len(column_names) > 0: + opts.set_columns(c_column_names) return opts diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index c326b19307d..3783e9ded6d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -11,6 +11,7 @@ PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") +PANDAS_GE_201 = PANDAS_VERSION >= version.parse("2.0.1") PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4") PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index e53fa1fb4bf..fc3e0ce56e1 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -576,7 +576,7 @@ def test_int_overflow(tmpdir): # The number of rows and the large element trigger delta encoding num_rows = 513 - df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int32") + df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int64") df["a"][0] = 1024 * 1024 * 1024 df["a"][num_rows - 1] = 1 df.to_orc(file_path) @@ -1669,16 +1669,7 @@ def run_orc_columns_and_index_param(index_obj, index, columns): expected = 
pd.read_orc(buffer, columns=columns) got = cudf.read_orc(buffer, columns=columns) - if columns: - # TODO: Remove workaround after this issue is fixed: - # https://github.com/pandas-dev/pandas/issues/47944 - assert_eq( - expected.sort_index(axis=1), - got.sort_index(axis=1), - check_index_type=True, - ) - else: - assert_eq(expected, got, check_index_type=True) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize("index_obj", [None, [10, 11, 12], ["x", "y", "z"]]) From 3344377b118ae63c71d79405c694f89c885dbdf8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 30 Dec 2023 14:52:11 +0530 Subject: [PATCH 093/162] Handle missing warning assertions for concat pytests (#14682) This PR adds warning assertions that were missed in https://github.com/rapidsai/cudf/pull/14672 On `pandas_2.0_feature_branch`: ``` = 110 failed, 101331 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1064.30s (0:17:44) = ``` This PR: ``` = 105 failed, 101336 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1068.90s (0:17:48) = ``` --- python/cudf/cudf/tests/test_index.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d06041301b9..9a927e65eb1 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1062,10 +1062,12 @@ def test_index_empty_append_name_conflict(): non_empty = cudf.Index([1], name="bar") expected = cudf.Index([1]) - result = non_empty.append(empty) + with pytest.warns(FutureWarning): + result = non_empty.append(empty) assert_eq(result, expected) - result = empty.append(non_empty) + with pytest.warns(FutureWarning): + result = empty.append(non_empty) assert_eq(result, expected) @@ -2861,7 +2863,8 @@ def test_index_methods(index, func): if func == "append": expected = pidx.append(other=pidx) - actual = gidx.append(other=gidx) + with expect_warning_if(len(gidx) == 0): + actual = gidx.append(other=gidx) else: 
expected = getattr(pidx, func)() actual = getattr(gidx, func)() From eabba98c593b4603240b187e9207f01d5459e248 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 30 Dec 2023 14:52:41 +0530 Subject: [PATCH 094/162] Fix a typo error in where pytest (#14683) This PR fixes a typo in isinstance check, thus fixing 6 pytest failures. --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 94aff555c7f..72a232f3d41 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6108,7 +6108,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): got_mask = gs_mask if hasattr(expect_where, "dtype") and isinstance( - expect_where, pd.CategoricalDtype + expect_where.dtype, pd.CategoricalDtype ): np.testing.assert_array_equal( expect_where.cat.codes, From bcdeb19ed66ce5dbcdb711ebb8cf803c23adb38a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 8 Jan 2024 20:34:40 +0530 Subject: [PATCH 095/162] Change empty column dtype to `string` from `float64` (#14691) This PR enforces deprecation where an empty column now defaults to `str` dtype rather than `float64` dtype. 
On `pandas_2.0_feature_branch`: ``` = 68 failed, 101373 passed, 2091 skipped, 952 xfailed, 312 xpassed in 1000.21s (0:16:40) = ``` This PR: ``` = 65 failed, 101364 passed, 2091 skipped, 964 xfailed, 312 xpassed in 1054.55s (0:17:34) = ``` --- python/cudf/cudf/core/column/column.py | 4 +-- python/cudf/cudf/core/column/numerical.py | 7 ++++- python/cudf/cudf/core/reshape.py | 8 ++--- python/cudf/cudf/core/series.py | 19 +++--------- python/cudf/cudf/testing/_utils.py | 27 ----------------- python/cudf/cudf/tests/test_dataframe.py | 24 ++++++++++----- python/cudf/cudf/tests/test_dropna.py | 7 ++--- python/cudf/cudf/tests/test_duplicates.py | 3 +- python/cudf/cudf/tests/test_index.py | 8 ++--- python/cudf/cudf/tests/test_indexing.py | 13 +++++--- python/cudf/cudf/tests/test_joining.py | 11 ++++++- python/cudf/cudf/tests/test_onehot.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 11 +++---- python/cudf/cudf/tests/test_series.py | 37 ++++++++++------------- python/cudf/cudf/tests/test_sorting.py | 2 +- python/cudf/cudf/tests/test_stats.py | 14 ++++----- 16 files changed, 86 insertions(+), 111 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e83d82307e5..5dbbbe5ac10 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2039,8 +2039,8 @@ def as_column( new_dtype = dtype elif len(arbitrary) == 0: # If the column is empty, it has to be - # a `float64` dtype. - new_dtype = cudf.dtype("float64") + # a `str` dtype. + new_dtype = cudf.dtype("str") else: # If the null column is not empty, it has to # be of `object` dtype. 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cdecf44cc8f..fb9fa954c68 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -173,7 +173,12 @@ def __setitem__(self, key: Any, value: Any): if isinstance(key, slice): out = self._scatter_by_slice(key, device_value) else: - key = as_column(key) + key = as_column( + key, + dtype="float64" + if isinstance(key, list) and len(key) == 0 + else None, + ) if not isinstance(key, cudf.core.column.NumericalColumn): raise ValueError(f"Invalid scatter map type {key.dtype}.") out = self._scatter_by_column(key, device_value) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 465186d81d2..5f9d333811a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -429,11 +429,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): return result elif typ is cudf.Series: - objs = [obj for obj in objs if len(obj)] - if len(objs) == 0: - return cudf.Series() - elif len(objs) == 1 and not ignore_index: - return objs[0] + new_objs = [obj for obj in objs if len(obj)] + if len(new_objs) == 1 and not ignore_index: + return new_objs[0] else: return cudf.Series._concat( objs, axis=axis, index=None if ignore_index else True diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6080a37f0a2..7b40e172da3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,7 +14,6 @@ Dict, MutableMapping, Optional, - Sequence, Set, Tuple, Union, @@ -601,18 +600,6 @@ def __init__( copy=False, nan_as_null=True, ): - if ( - isinstance(data, Sequence) - and len(data) == 0 - and dtype is None - and getattr(data, "dtype", None) is None - ): - warnings.warn( - "The default dtype for empty Series will be 'object' instead " - "of 'float64' in a future version. 
Specify a dtype explicitly " - "to silence this warning.", - FutureWarning, - ) if isinstance(data, pd.Series): if name is None: name = data.name @@ -1621,7 +1608,11 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = cudf.core.index.Index._concat([o.index for o in objs]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + index = cudf.core.index.Index._concat( + [o.index for o in objs] + ) names = {obj.name for obj in objs} if len(names) == 1: diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 9182246826f..af8a38b8f01 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -19,7 +19,6 @@ import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf.api.types import is_scalar from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -397,32 +396,6 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs holds the same memory.") -def _create_pandas_series_float64_default( - data=None, index=None, dtype=None, *args, **kwargs -): - # Wrapper around pd.Series using a float64 - # default dtype for empty data to silence warnings. - # TODO: Remove this in pandas-2.0 upgrade - if dtype is None and ( - data is None or (not is_scalar(data) and len(data) == 0) - ): - dtype = "float64" - return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) - - -def _create_cudf_series_float64_default( - data=None, index=None, dtype=None, *args, **kwargs -): - # Wrapper around cudf.Series using a float64 - # default dtype for empty data to silence warnings. 
- # TODO: Remove this in pandas-2.0 upgrade - if dtype is None and ( - data is None or (not is_scalar(data) and len(data) == 0) - ): - dtype = "float64" - return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs) - - parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 72a232f3d41..e4007ec9cf9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -40,7 +40,6 @@ ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, - _create_cudf_series_float64_default, assert_eq, assert_exceptions_equal, assert_neq, @@ -1376,6 +1375,11 @@ def test_dataframe_setitem_from_masked_object(): def test_dataframe_append_to_empty(): pdf = pd.DataFrame() pdf["a"] = [] + if PANDAS_GE_200: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf["b"] = [1, 2, 3] gdf = cudf.DataFrame() @@ -2713,8 +2717,8 @@ def test_decimal_quantile(q, interpolation, decimal_type): def test_empty_quantile(): - pdf = pd.DataFrame({"x": []}) - df = cudf.DataFrame({"x": []}) + pdf = pd.DataFrame({"x": []}, dtype="float64") + df = cudf.DataFrame({"x": []}, dtype="float64") actual = df.quantile() expected = pdf.quantile() @@ -2972,7 +2976,7 @@ def test_series_all_null(num_elements, null_type): @pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements - sr = _create_cudf_series_float64_default(data, nan_as_null=False) + sr = cudf.Series(data, nan_as_null=False) np.testing.assert_equal(sr.null_count, 0) @@ -4653,7 +4657,7 @@ def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): ) def test_series_values_host_property(data): pds = pd.Series(data=data, dtype=None 
if data else float) - gds = _create_cudf_series_float64_default(data) + gds = cudf.Series(data=data, dtype=None if data else float) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4676,7 +4680,7 @@ def test_series_values_host_property(data): ) def test_series_values_property(data): pds = pd.Series(data=data, dtype=None if data else float) - gds = _create_cudf_series_float64_default(data) + gds = cudf.from_pandas(pds) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) np.testing.assert_array_equal(gds_vals.get(), pds.values) @@ -6663,7 +6667,13 @@ def test_dataframe_init_from_arrays_cols(data, cols, index): None, ], ) -def test_dataframe_assign_scalar(col_data, assign_val): +def test_dataframe_assign_scalar(request, col_data, assign_val): + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GE_200 and len(col_data) == 0, + reason="https://github.com/pandas-dev/pandas/issues/56679", + ) + ) pdf = pd.DataFrame({"a": col_data}) gdf = cudf.DataFrame({"a": col_data}) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index d53d24cd6c6..b9dbfd330a4 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,10 +5,7 @@ import pytest import cudf -from cudf.testing._utils import ( - _create_pandas_series_float64_default, - assert_eq, -) +from cudf.testing._utils import assert_eq @pytest.mark.parametrize( @@ -25,7 +22,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index ddbfdf5eee2..c6f025aa956 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -10,7 +10,6 @@ import cudf from cudf import concat from cudf.testing._utils import ( - 
_create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, ) @@ -62,7 +61,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = _create_pandas_series_float64_default(data) + pds = pd.Series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9a927e65eb1..38ac1a844b8 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -31,8 +31,6 @@ SERIES_OR_INDEX_NAMES, SIGNED_INTEGER_TYPES, UNSIGNED_TYPES, - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -987,8 +985,8 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals(_create_pandas_series_float64_default(pd_other)) - actual = gd_data.equals(_create_cudf_series_float64_default(gd_other)) + expected = pd_data.equals(pd.Series(pd_other)) + actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) expected = pd_data.astype("category").equals(pd_other) @@ -2559,7 +2557,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index f2b58a80362..3e09a11ad35 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -938,7 +938,7 @@ def test_series_setitem_basics(key, value, nulls): ): psr[key] = value with expect_warning_if( - isinstance(value, list) and len(value) == 0 and nulls == "none" + isinstance(value, list) and len(value) == 0 and 
not len(key) == 0 ): gsr[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -991,7 +991,7 @@ def test_series_setitem_iloc(key, value, nulls): ): psr.iloc[key] = value with expect_warning_if( - isinstance(value, list) and len(value) == 0 and nulls == "none" + isinstance(value, list) and len(value) == 0 and not len(key) == 0 ): gsr.iloc[key] = value assert_eq(psr, gsr, check_dtype=False) @@ -1610,9 +1610,12 @@ def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame( {"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2]) ) - expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( - {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) - ) + with pytest.warns(FutureWarning): + # Seems to be a false warning from pandas, + # but nevertheless catching it. + expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) + ) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b273e554158..670536ac32e 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1941,7 +1941,11 @@ def test_string_join_key(str_data, num_keys, how): gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data - + if PANDAS_GE_200 and len(other_data) == 0: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf2 = pdf.copy() gdf2 = gdf.copy() @@ -2017,6 +2021,11 @@ def test_string_join_non_key(str_data, num_cols, how): gdf[i] = cudf.Series(str_data, dtype="str") pdf["a"] = other_data gdf["a"] = other_data + if PANDAS_GE_200 and len(other_data) == 0: + # TODO: Remove this workaround after + # the following bug is fixed: + # https://github.com/pandas-dev/pandas/issues/56679 + pdf["a"] = pdf["a"].astype("str") pdf2 = pdf.copy() gdf2 
= gdf.copy() diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 9a4e71b6c9d..f60f80fcec7 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -24,8 +24,8 @@ ) @pytest.mark.parametrize("dtype", ["bool", "uint8"]) def test_get_dummies(data, index, dtype): - gdf = cudf.DataFrame({"x": data}, index=index) pdf = pd.DataFrame({"x": data}, index=index) + gdf = cudf.from_pandas(pdf) encoded_expected = pd.get_dummies(pdf, prefix="test", dtype=dtype) encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=dtype) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 8aa5050671a..b4e69f47d2a 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -9,10 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 -from cudf.testing._utils import ( - _create_pandas_series_float64_default, - assert_eq, -) +from cudf.testing._utils import assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -58,8 +55,8 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = _create_pandas_series_float64_default(data, index=index) - gsr = cudf.Series(psr) + psr = pd.Series(data, index=index) + gsr = cudf.from_pandas(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): expect = getattr( @@ -316,7 +313,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = _create_pandas_series_float64_default(data, index=index) + psr = pd.Series(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 1e5d41888de..1aae58f47d1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ 
b/python/cudf/cudf/tests/test_series.py @@ -20,8 +20,6 @@ NUMERIC_TYPES, SERIES_OR_INDEX_NAMES, TIMEDELTA_TYPES, - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -392,8 +390,8 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = _create_pandas_series_float64_default(data) - gsr = _create_cudf_series_float64_default(data) + psr = pd.Series(data) + gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -475,7 +473,7 @@ def test_series_describe_other_types(ps): ) @pytest.mark.parametrize("use_na_sentinel", [True, False]) def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = _create_cudf_series_float64_default(data) + gsr = cudf.Series(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize( @@ -499,7 +497,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): ) @pytest.mark.parametrize("sort", [True, False]) def test_series_factorize_sort(data, sort): - gsr = _create_cudf_series_float64_default(data) + gsr = cudf.Series(data) psr = gsr.to_pandas(nullable=True) expected_labels, expected_cats = psr.factorize(sort=sort) @@ -1665,7 +1663,7 @@ def test_series_nunique_index(data): ], ) def test_axes(data): - csr = _create_cudf_series_float64_default(data) + csr = cudf.Series(data) psr = csr.to_pandas() expected = psr.axes @@ -1743,7 +1741,7 @@ def test_series_truncate_datetimeindex(): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = _create_pandas_series_float64_default(data, index=index) + psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1803,7 +1801,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = 
cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1832,7 +1830,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1861,7 +1859,7 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = _create_pandas_series_float64_default(data) + psr = pd.Series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -2082,7 +2080,7 @@ def test_series_to_dict(into): ], ) def test_series_hasnans(data): - gs = _create_cudf_series_float64_default(data, nan_as_null=False) + gs = cudf.Series(data, nan_as_null=False) ps = gs.to_pandas(nullable=True) # Check type to avoid mixing Python bool and NumPy bool @@ -2155,8 +2153,8 @@ def test_series_init_dict_with_index(data, index): "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] ) def test_series_init_scalar_with_index(data, index): - pandas_series = _create_pandas_series_float64_default(data, index=index) - cudf_series = _create_cudf_series_float64_default(data, index=index) + pandas_series = pd.Series(data, index=index) + cudf_series = cudf.Series(data, index=index) assert_eq( pandas_series, @@ -2305,15 +2303,12 @@ def test_series_round_builtin(data, digits): assert_eq(expected, actual) -def test_series_empty_warning(): - with pytest.warns(FutureWarning): - expected = pd.Series([]) - with pytest.warns(FutureWarning): - actual = cudf.Series([]) - assert_eq(expected, actual) +def test_series_empty_dtype(): + expected = pd.Series([]) + actual = cudf.Series([]) + assert_eq(expected, actual, check_dtype=True) -@pytest.mark.filterwarnings("ignore::FutureWarning") # tested above @pytest.mark.parametrize("data", [None, {}, []]) def test_series_empty_index_rangeindex(data): expected = cudf.RangeIndex(0) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 
b3db1310adb..518b7597a12 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -394,6 +394,6 @@ def test_dataframe_scatter_by_map_7513(ids): def test_dataframe_scatter_by_map_empty(): - df = DataFrame({"a": [], "b": []}) + df = DataFrame({"a": [], "b": []}, dtype="float64") scattered = df.scatter_by_map(df["a"]) assert len(scattered) == 0 diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f24a5ea7b41..8e1a91c7a4f 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -11,8 +11,6 @@ from cudf.api.extensions import no_default from cudf.datasets import randomdata from cudf.testing._utils import ( - _create_cudf_series_float64_default, - _create_pandas_series_float64_default, assert_eq, assert_exceptions_equal, expect_warning_if, @@ -225,8 +223,8 @@ def test_approx_quantiles_int(): ) def test_misc_quantiles(data, q): - pdf_series = _create_pandas_series_float64_default(data) - gdf_series = _create_cudf_series_float64_default(data) + pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gdf_series = cudf.from_pandas(pdf_series) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) actual = gdf_series.quantile(q) @@ -539,14 +537,16 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False]) def test_nans_stats(data, ops, skipna): - psr = _create_pandas_series_float64_default(data) - gsr = _create_cudf_series_float64_default(data, nan_as_null=False) + psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gsr = cudf.from_pandas(psr) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - gsr = _create_cudf_series_float64_default(data, nan_as_null=False) + gsr = cudf.Series( + data, dtype="float64" if len(data) == 0 else None, nan_as_null=False + ) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the 
operations. So only # testing for `skipna=True` when `nan_as_null=False` From 6bcaf443d902accc3def2bb15cd6542abef6885b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 23 Jan 2024 11:39:17 +0530 Subject: [PATCH 096/162] Preserve empty index types in parquet reader (#14818) This PR preserves types of empty column index objects whose metadata is already present in the parquet file. This PR: = 107 failed, 101869 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1265.57s (0:21:05) = On pandas_2.0_feature_branch: = 111 failed, 101865 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1292.26s (0:21:32) = --- python/cudf/cudf/_lib/parquet.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 78606a45fc1..e12b5be3c71 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -110,6 +110,7 @@ cdef class BufferArrayFromVector: def _parse_metadata(meta): file_is_range_index = False file_index_cols = None + file_column_dtype = None if 'index_columns' in meta and len(meta['index_columns']) > 0: file_index_cols = meta['index_columns'] @@ -117,7 +118,9 @@ def _parse_metadata(meta): if isinstance(file_index_cols[0], dict) and \ file_index_cols[0]['kind'] == 'range': file_is_range_index = True - return file_is_range_index, file_index_cols + if 'column_indexes' in meta and len(meta['column_indexes']) == 1: + file_column_dtype = meta['column_indexes'][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, @@ -185,6 +188,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cdef vector[unordered_map[string, string]] per_file_user_data = \ c_result.metadata.per_file_user_data + column_index_type = None index_col_names = None is_range_index = True for single_file in per_file_user_data: @@ -192,7 +196,7 @@ cpdef 
read_parquet(filepaths_or_buffers, columns=None, row_groups=None, meta = None if json_str != "": meta = json.loads(json_str) - file_is_range_index, index_col = _parse_metadata(meta) + file_is_range_index, index_col, column_index_type = _parse_metadata(meta) is_range_index &= file_is_range_index if not file_is_range_index and index_col is not None \ @@ -302,6 +306,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if use_pandas_metadata: df.index.names = index_col + # Set column dtype for empty types. + if len(df._data.names) == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) return df From bdbf0bc97cc257d018c1b58433a71c294030da4b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Jan 2024 12:14:30 +0530 Subject: [PATCH 097/162] Fix `Dataframe.agg` to not return incorrect dtypes (#14851) This PR fixes `DataFrame.agg` API where the actual dataframe was being casted to incorrect dtype (object dtype) and then the operations were being performed. This PR adds strict checks at two places. 
This PR: ``` = 95 failed, 101829 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1479.83s (0:24:39) = ``` On `pandas_2.0_feature_branch`: ``` = 111 failed, 101865 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1265.57s (0:21:05) = ``` --- python/cudf/cudf/core/dataframe.py | 32 +++++++++------ python/cudf/cudf/tests/test_dataframe.py | 51 ++++++++++++++---------- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5ef7c6027a9..1057fd0b716 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -84,6 +84,7 @@ from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.core.udf.row_function import _get_row_kernel +from cudf.errors import MixedTypeError from cudf.utils import applyutils, docutils, ioutils, queryutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -3609,11 +3610,12 @@ def agg(self, aggs, axis=None): * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ - # TODO: Remove the typecasting below once issue #6846 is fixed - # link dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - df_normalized = self.astype(common_dtype) + if not is_bool_dtype(common_dtype) and any( + is_bool_dtype(dtype) for dtype in dtypes + ): + raise MixedTypeError("Cannot create a column with mixed types") if any(is_string_dtype(dt) for dt in dtypes): raise NotImplementedError( @@ -3631,17 +3633,17 @@ def agg(self, aggs, axis=None): # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization for agg in aggs: - result[agg] = getattr(df_normalized, agg)() + result[agg] = getattr(self, agg)() return result.T.sort_index(axis=1, ascending=True) elif isinstance(aggs, str): - if not hasattr(df_normalized, aggs): + if not hasattr(self, aggs): raise AttributeError( f"{aggs} is not a valid function for " f"'DataFrame' object" 
) result = DataFrame() - result[aggs] = getattr(df_normalized, aggs)() + result[aggs] = getattr(self, aggs)() result = result.iloc[:, 0] result.name = None return result @@ -3653,15 +3655,16 @@ def agg(self, aggs, axis=None): "callable parameter is not implemented yet" ) elif all(isinstance(val, str) for val in aggs.values()): - result = cudf.Series(index=cols) + res = {} for key, value in aggs.items(): - col = df_normalized[key] + col = self[key] if not hasattr(col, value): raise AttributeError( f"{value} is not a valid function for " f"'Series' object" ) - result[key] = getattr(col, value)() + res[key] = getattr(col, value)() + result = cudf.Series(list(res.values()), index=res.keys()) elif all(isinstance(val, abc.Iterable) for val in aggs.values()): idxs = set() for val in aggs.values(): @@ -3677,7 +3680,7 @@ def agg(self, aggs, axis=None): ) result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): - col = df_normalized[key] + col = self[key] col_empty = column_empty( len(idxs), dtype=col.dtype, masked=True ) @@ -6160,8 +6163,13 @@ def _reduce( else: source_dtypes = [c.dtype for c in source._data.columns] common_dtype = find_common_type(source_dtypes) - if is_object_dtype(common_dtype) and any( - not is_object_dtype(dtype) for dtype in source_dtypes + if ( + is_object_dtype(common_dtype) + and any( + not is_object_dtype(dtype) for dtype in source_dtypes + ) + or not is_bool_dtype(common_dtype) + and any(is_bool_dtype(dtype) for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 325097968f7..026f0aa845d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9173,17 +9173,8 @@ def test_dataframe_constructor_column_index_only(): @pytest.mark.parametrize( "data", [ - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, - {"a": [1.0, 2.0, 3.0], "b": [3.0, 
4.0, 5.0], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [3, 4, 5], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, - { - "a": [1.0, 2.0, 3.0], - "b": [True, True, False], - "c": [False, True, False], - }, - {"a": [1, 2, 3], "b": [3, 4, 5], "c": [2.0, 3.0, 4.0]}, - {"a": [1, 2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, + {"a": [1, 2.5, 3], "b": [3, 4.5, 5], "c": [2.0, 3.0, 4.0]}, + {"a": [1, 2.2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, ], ) @pytest.mark.parametrize( @@ -9208,14 +9199,36 @@ def test_agg_for_dataframes(data, aggs): expect = pdf.agg(aggs).sort_index() got = gdf.agg(aggs).sort_index() - assert_eq(expect, got, check_dtype=False) + + assert_eq(expect, got, check_dtype=True) + + +@pytest_unmark_spilling +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, + {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, + ], +) +@pytest.mark.parametrize( + "aggs", + [ + ["min", "sum", "max"], + "sum", + {"a": "sum", "b": "min", "c": "max"}, + ], +) +def test_agg_for_dataframes_error(data, aggs): + gdf = cudf.DataFrame(data) + + with pytest.raises(TypeError): + gdf.agg(aggs) @pytest.mark.parametrize("aggs", [{"a": np.sum, "b": np.min, "c": np.max}]) def test_agg_for_unsupported_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises(NotImplementedError): gdf.agg(aggs) @@ -9223,9 +9236,7 @@ def test_agg_for_unsupported_function(aggs): @pytest.mark.parametrize("aggs", ["asdf"]) def test_agg_for_dataframe_with_invalid_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises( AttributeError, @@ -9236,9 +9247,7 @@ def 
test_agg_for_dataframe_with_invalid_function(aggs): @pytest.mark.parametrize("aggs", [{"a": "asdf"}]) def test_agg_for_series_with_invalid_function(aggs): - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) with pytest.raises( AttributeError, From 28b1814e9fb509fb2bfe6783613e5a8f792ee34f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Jan 2024 12:15:01 +0530 Subject: [PATCH 098/162] Catch warnings in reductions (#14852) This PR validates the warnings generated by certain reduction ops. --- python/cudf/cudf/tests/test_reductions.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index e8bbffcacaa..1a38cb3dd22 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -360,10 +360,9 @@ def test_reductions_axis_none_warning(op): FutureWarning, ): actual = getattr(df, op)(axis=None) - # with expect_warning_if( - # op in {"kurt", "kurtosis", "skew", "min", "max", "mean", "median"}, - # FutureWarning, - # ): - - expected = getattr(pdf, op)(axis=None) + with expect_warning_if( + op in {"sum", "product", "std", "var"}, + FutureWarning, + ): + expected = getattr(pdf, op)(axis=None) assert_eq(expected, actual, check_dtype=False) From df5c78b6fa5e3e3c3c673a2d8e5b8757d903cbd3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Jan 2024 22:24:31 +0530 Subject: [PATCH 099/162] Catch groupby jit apply warnings (#14858) This PR catches `RuntimeWarning`'s in jit groupby pytests. 
This PR: ``` = 61 failed, 101866 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1446.19s (0:24:06) = ``` On `pandas_2.0_feature_branch`: ``` = 91 failed, 101836 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1348.36s (0:22:28) = ``` --- python/cudf/cudf/tests/test_groupby.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 456ce961a79..f594963dcda 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -565,9 +565,10 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) + with expect_warning_if(func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning): + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) @pytest.mark.parametrize("dtype", ["float64"]) @@ -652,7 +653,8 @@ def func(group): with pytest.raises(UDFError, match=m): run_groupby_apply_jit_test(dataset, func, keys) return - run_groupby_apply_jit_test(dataset, func, keys) + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(dataset, func, keys) @pytest.mark.parametrize("dtype", ["int32", "int64"]) @@ -667,7 +669,8 @@ def test_groupby_apply_jit_correlation_zero_variance(dtype): def func(group): return group["b"].corr(group["c"]) - run_groupby_apply_jit_test(data, func, ["a"]) + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(data, func, ["a"]) @pytest.mark.parametrize("op", unary_ops) From 8784551f84e06acb0486ddd72beed8fa6a197511 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 25 Jan 2024 01:29:42 +0530 Subject: [PATCH 100/162] Fix all reduction pytest failures (#14869) This PR fixes all the remaining 
one-off reduction pytest failures. This PR: ``` = 54 failed, 101872 passed, 2091 skipped, 977 xfailed, 312 xpassed in 1432.99s (0:23:52) = ``` On `pandas_2.0_feature_branch`: ``` = 61 failed, 101866 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1446.19s (0:24:06) = ``` --- python/cudf/cudf/tests/test_stats.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index c8357699350..edd7da3d42c 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -244,7 +244,7 @@ def test_misc_quantiles(data, q): "nan_as_null": False, }, {"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, - {"data": []}, + {"data": [], "dtype": "float64"}, {"data": [-3]}, ], ) @@ -274,13 +274,12 @@ def test_kurt_skew_error(op): gs = cudf.Series(["ab", "cd"]) ps = gs.to_pandas() - with pytest.warns(FutureWarning): - assert_exceptions_equal( - getattr(gs, op), - getattr(ps, op), - lfunc_args_and_kwargs=([], {"numeric_only": True}), - rfunc_args_and_kwargs=([], {"numeric_only": True}), - ) + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) @pytest.mark.parametrize( @@ -359,10 +358,17 @@ def test_series_median(dtype, num_na): @pytest.mark.parametrize( "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] ) -def test_series_pct_change(data, periods, fill_method): +def test_series_pct_change(request, data, periods, fill_method): cs = cudf.Series(data) ps = cs.to_pandas() - + request.applymarker( + pytest.mark.xfail( + condition=( + len(cs) == 0 and periods == 0 and fill_method is no_default + ), + reason="https://github.com/pandas-dev/pandas/issues/57056", + ) + ) if np.abs(periods) <= len(cs): with expect_warning_if(fill_method not in (no_default, None)): got = 
cs.pct_change(periods=periods, fill_method=fill_method) From d7f9688bb58ceee32a9753dc2f3f6dd046a92257 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 24 Jan 2024 22:45:32 -0500 Subject: [PATCH 101/162] Fix empty groupby return types (#14871) Closes #14862 This PR fixes the errors in #14862 by ensuring we match the pandas return type when doing grouped count, size, idxmax, idxmin. --------- Co-authored-by: Ashwin Srinath --- python/cudf/cudf/core/groupby/groupby.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index bf470c29c99..6aba93855a7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -596,7 +596,7 @@ def agg(self, func): # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) - if agg_kind in {"COUNT", "SIZE"}: + if agg_kind in {"COUNT", "SIZE", "ARGMIN", "ARGMAX"}: data[key] = col.astype("int64") elif ( self.obj.empty @@ -1449,9 +1449,11 @@ def mult(df): dtype: int64 """ - if self.obj.empty: - res = self.obj.copy(deep=True) + if function in {"count", "size", "idxmin", "idxmax"}: + res = cudf.Series([], dtype="int64") + else: + res = self.obj.copy(deep=True) res.index = self.grouping.keys if function in {"sum", "product"}: # For `sum` & `product`, boolean types From 8a25f70c13991f5bb9e904e4e11283e9020f9381 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Jan 2024 18:13:14 -1000 Subject: [PATCH 102/162] Support kurt/skew(axis=None) for multi columns/low row count (#14874) closes #14866 @galipremsagar it appears the linked failing test in the issue test_reductions_axis_none_warning expected FutureWarning from these calls. Should they be expected for kurt/skew too? 
--- python/cudf/cudf/core/dataframe.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1057fd0b716..a3642bcc43f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -57,6 +57,7 @@ is_string_dtype, ) from cudf.core import column, df_protocol, indexing_utils, reshape +from cudf.core._compat import PANDAS_GE_200 from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -95,11 +96,8 @@ min_scalar_type, numeric_normalize_types, ) - from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api -from cudf.core._compat import PANDAS_GE_200 - _cupy_nan_methods_map = { "min": "nanmin", @@ -6112,8 +6110,13 @@ def _reduce( if axis == 0 else source.index ) - if axis in {0, 2}: + if axis == 2 and op in ("kurtosis", "kurt", "skew"): + # TODO: concat + op can probably be done in the general case + # for axis == 2. 
+ return getattr(concat_columns(source._data.columns), op)( + **kwargs + ) try: result = [ getattr(source._data[col], op)(**kwargs) From 7bf4376d8067130d7e3c5eb5afc0b033e3658cd9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 25 Jan 2024 16:54:10 +0530 Subject: [PATCH 103/162] Fix miscellaneous failures in pytests (#14879) --- python/cudf/cudf/core/series.py | 5 +---- python/cudf/cudf/tests/test_index.py | 4 +++- python/cudf/cudf/tests/test_joining.py | 11 +++++++++-- python/cudf/cudf/tests/test_series.py | 3 ++- python/dask_cudf/dask_cudf/backends.py | 7 +++---- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 53218903ed2..e1015e53c88 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -606,10 +606,7 @@ def __init__( name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): - if isinstance(data.index, pd.MultiIndex): - index = cudf.from_pandas(data.index) - else: - index = as_index(data.index) + index_from_data = as_index(data.index) elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 20af94576fe..e47b2f5d5d5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2800,7 +2800,9 @@ def test_rangeindex_join_user_option(default_integer_bitwidth): actual = idx1.join(idx2, how="inner", sort=True) expected = idx1.to_pandas().join(idx2.to_pandas(), how="inner", sort=True) assert actual.dtype == cudf.dtype(f"int{default_integer_bitwidth}") - assert_eq(expected, actual) + # exact=False to ignore dtype comparison, + # because `default_integer_bitwidth` is cudf only option + assert_eq(expected, actual, exact=False) def 
test_rangeindex_where_user_option(default_integer_bitwidth): diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 42b466f486b..8ce2adae15b 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2246,11 +2246,18 @@ def test_index_join_return_indexers_notimplemented(): @pytest.mark.parametrize("how", ["inner", "outer"]) -def test_index_join_names(how): +def test_index_join_names(request, how): idx1 = cudf.Index([10, 1, 2, 4, 2, 1], name="a") idx2 = cudf.Index([-10, 2, 3, 1, 2], name="b") + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/57065", + ) + ) + pidx1 = idx1.to_pandas() + pidx2 = idx2.to_pandas() - expected = idx1.to_pandas().join(idx2.to_pandas(), how=how) + expected = pidx1.join(pidx2, how=how) actual = idx1.join(idx2, how=how) assert_join_results_equal(actual, expected, how=how) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 2772ce6ffee..623657d127f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2159,7 +2159,8 @@ def test_series_init_scalar_with_index(data, index): assert_eq( pandas_series, cudf_series, - check_index_type=False if data is None and index is None else True, + check_index_type=data is not None or index is not None, + check_dtype=data is not None, ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 11e0f1e0e60..86283f57366 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import warnings from collections.abc import Iterator @@ -8,7 +8,6 @@ import pandas as pd import pyarrow as pa from pandas.api.types import is_scalar -from pandas.core.tools.datetimes import is_datetime64tz_dtype import dask.dataframe as dd from dask import config @@ -42,7 +41,7 @@ from dask.utils import Dispatch, is_arraylike import cudf -from cudf.api.types import is_string_dtype +from cudf.api.types import _is_datetime64tz_dtype, is_string_dtype from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -127,7 +126,7 @@ def _get_non_empty_data(s): data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): data = pa.array(["cat", "dog"]) - elif is_datetime64tz_dtype(s.dtype): + elif _is_datetime64tz_dtype(s.dtype): from cudf.utils.dtypes import get_time_unit data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s)) From d83f12e37a2a42d3fce7f2302b104ee8f4b0619e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 Jan 2024 01:29:38 +0530 Subject: [PATCH 104/162] Preserve columns dtype in dataframe constructor (#14878) This PR preserves columns dtype in DataFrame constructor. 
This PR: = 52 failed, 101872 passed, 2091 skipped, 977 xfailed, 312 xpassed in 1188.72s (0:19:48) = On pandas_2.0_feature_branch: = 61 failed, 101866 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1446.19s (0:24:06) = --- python/cudf/cudf/core/dataframe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a3642bcc43f..5fa1956eaf1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -795,6 +795,7 @@ def __init__( if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): if columns is not None: + label_dtype = getattr(columns, "dtype", None) data = dict(zip(columns, [data])) rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -802,6 +803,7 @@ def __init__( else: data = dict(enumerate([data])) rangeindex = True + label_dtype = None new_df = DataFrame(data=data, index=index) self._data = new_df._data @@ -812,6 +814,11 @@ def __init__( else self._data._level_names ) self._data.rangeindex = rangeindex + self._data.label_dtype = ( + cudf.dtype(label_dtype) + if label_dtype is not None + else None + ) elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index From 8db3b706d1ff4c6182abe5d5c7374a8233772c96 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Jan 2024 22:35:13 +0000 Subject: [PATCH 105/162] Disable style check --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index edcc140b191..25c1294d11a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,6 +37,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 with: enable_check_generated_files: false + enable_check_style: false conda-cpp-build: needs: checks secrets: inherit From 4b5b8af994956315836fdf252cd453620e5e9aea Mon Sep 17 00:00:00 2001 From: 
Vyas Ramasubramani Date: Thu, 25 Jan 2024 22:37:59 +0000 Subject: [PATCH 106/162] Pin pandas --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a5e3ea4c531..0c58976de86 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas>=1.3,<1.6.0dev0 +- pandas==2.1.4 - pandoc - pip - pre-commit diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 579bbb6d52d..0671a1a8f98 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -62,7 +62,7 @@ dependencies: - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas>=1.3,<1.6.0dev0 +- pandas==2.1.4 - pandoc - pip - pre-commit diff --git a/dependencies.yaml b/dependencies.yaml index 20998847a75..3513c6161c8 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -500,7 +500,7 @@ dependencies: packages: - fsspec>=0.6.0 - *numpy - - pandas>=1.3,<1.6.0dev0 + - pandas==2.1.4 run_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 18771804f61..62f218e86a4 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "numpy>=1.21,<1.25", "nvtx>=0.2.1", "packaging", - "pandas>=1.3,<1.6.0dev0", + "pandas==2.1.4", "protobuf>=4.21,<5", "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 33065da6e8d..6613a5f32e3 100644 --- 
a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.21,<1.25", - "pandas>=1.3,<1.6.0dev0", + "pandas==2.1.4", "rapids-dask-dependency==24.2.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ From d2cc4db4a46fd787122b2117ff8981d450b2a06a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Jan 2024 22:41:59 +0000 Subject: [PATCH 107/162] Disable some more jobs --- .github/workflows/pr.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 25c1294d11a..67396cb0274 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,19 +13,19 @@ jobs: pr-builder: needs: - checks - - conda-cpp-build - - conda-cpp-tests - - conda-python-build - - conda-python-cudf-tests - - conda-python-other-tests - - conda-java-tests - - conda-notebook-tests - - docs-build + #- conda-cpp-build + #- conda-cpp-tests + #- conda-python-build + #- conda-python-cudf-tests + #- conda-python-other-tests + #- conda-java-tests + #- conda-notebook-tests + #- docs-build - wheel-build-cudf - wheel-tests-cudf - wheel-build-dask-cudf - wheel-tests-dask-cudf - - devcontainer + #- devcontainer - unit-tests-cudf-pandas - pandas-tests #- pandas-tests-diff From 32e0982ed73319f2459581ad83e21d19d567ce56 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Jan 2024 22:44:55 +0000 Subject: [PATCH 108/162] Actually remove the jobs --- .github/workflows/pr.yaml | 142 +++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 67396cb0274..c570fea3ea4 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -38,69 +38,69 @@ jobs: with: 
enable_check_generated_files: false enable_check_style: false - conda-cpp-build: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 - with: - build_type: pull-request - conda-cpp-tests: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 - with: - build_type: pull-request - conda-python-build: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 - with: - build_type: pull-request - conda-python-cudf-tests: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 - with: - build_type: pull-request - test_script: "ci/test_python_cudf.sh" - conda-python-other-tests: - # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 - with: - build_type: pull-request - test_script: "ci/test_python_other.sh" - conda-java-tests: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/test_java.sh" - conda-notebook-tests: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/test_notebooks.sh" - docs-build: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 - with: - build_type: pull-request - node_type: 
"gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/build_docs.sh" + #conda-cpp-build: + # needs: checks + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + # with: + # build_type: pull-request + #conda-cpp-tests: + # needs: conda-cpp-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + # with: + # build_type: pull-request + #conda-python-build: + # needs: conda-cpp-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + # with: + # build_type: pull-request + #conda-python-cudf-tests: + # needs: conda-python-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + # with: + # build_type: pull-request + # test_script: "ci/test_python_cudf.sh" + #conda-python-other-tests: + # # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism + # needs: conda-python-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + # with: + # build_type: pull-request + # test_script: "ci/test_python_other.sh" + #conda-java-tests: + # needs: conda-cpp-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + # with: + # build_type: pull-request + # node_type: "gpu-v100-latest-1" + # arch: "amd64" + # container_image: "rapidsai/ci-conda:latest" + # run_script: "ci/test_java.sh" + #conda-notebook-tests: + # needs: conda-python-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + # with: + # build_type: pull-request + # node_type: "gpu-v100-latest-1" + # arch: "amd64" + # container_image: "rapidsai/ci-conda:latest" + # run_script: "ci/test_notebooks.sh" + #docs-build: + # needs: conda-python-build + # 
secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + # with: + # build_type: pull-request + # node_type: "gpu-v100-latest-1" + # arch: "amd64" + # container_image: "rapidsai/ci-conda:latest" + # run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks secrets: inherit @@ -132,14 +132,14 @@ jobs: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request script: ci/test_wheel_dask_cudf.sh - devcontainer: - secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 - with: - build_command: | - sccache -z; - build-all -DBUILD_BENCHMARKS=ON -DNVBench_ENABLE_CUPTI=OFF --verbose; - sccache -s; + #devcontainer: + # secrets: inherit + # uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + # with: + # build_command: | + # sccache -z; + # build-all -DBUILD_BENCHMARKS=ON -DNVBench_ENABLE_CUPTI=OFF --verbose; + # sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit From 302c8760008afee155034afe5a7913b94bc899c2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:10:01 -0800 Subject: [PATCH 109/162] Unpin numpy<1.25 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 3 +- dependencies.yaml | 3 +- python/cudf/cudf/core/column/categorical.py | 7 +++-- python/cudf/cudf/core/column/column.py | 7 ++--- python/cudf/cudf/core/column/numerical.py | 2 +- python/cudf/cudf/core/join/_join_helpers.py | 2 +- python/cudf/cudf/tests/test_array_function.py | 6 ++-- python/cudf/cudf/tests/test_datasets.py | 2 +- python/cudf/cudf/tests/test_joining.py | 8 ++--- python/cudf/cudf/utils/dtypes.py | 30 ++++++++++++++++++- python/cudf/pyproject.toml | 4 +-- python/cudf_kafka/pyproject.toml | 2 +- 
python/dask_cudf/pyproject.toml | 2 +- 15 files changed, 54 insertions(+), 28 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 0c58976de86..3d3830e1dc9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -58,7 +58,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21,<1.25 +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 - nvcomp==3.0.5 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 0671a1a8f98..e441a0aac4b 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -57,7 +57,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21,<1.25 +- numpy>=1.21 - numpydoc - nvcomp==3.0.5 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bc91ee61f6f..89c3d8ecab2 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,8 +80,7 @@ requirements: - cupy >=12.0.0 # TODO: Pin to numba<0.58 until #14160 is resolved - numba >=0.57,<0.58 - # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - - numpy >=1.21,<1.25 + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index 3513c6161c8..9289a07083e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -266,8 +266,7 @@ dependencies: - *cmake_ver - cython>=3.0.3 - *ninja - # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - - &numpy numpy>=1.21,<1.25 + - &numpy numpy>=1.21 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- pyarrow==14.0.1.* diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index fb7f841c3f3..f036703a147 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast import numpy as np +import pandas as pd import pyarrow as pa from numba import cuda from typing_extensions import Self import cudf -import pandas as pd from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike @@ -21,6 +21,7 @@ from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( + find_common_type, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, @@ -265,8 +266,8 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: f"type-cast new_categories to the same type as " f"existing categories." 
) - common_dtype = np.find_common_type( - [old_categories.dtype, new_categories.dtype], [] + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] ) new_categories = new_categories.astype(common_dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 70e7717be33..d37f0d54c1a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -5,7 +5,6 @@ import builtins import pickle import warnings - from collections import abc from functools import cached_property from itertools import chain @@ -25,6 +24,7 @@ import cupy import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.compute as pc from numba import cuda @@ -33,7 +33,6 @@ import rmm import cudf -import pandas as pd from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.null_mask import ( @@ -87,6 +86,7 @@ from cudf.utils.dtypes import ( _maybe_convert_to_default_type, cudf_dtype_from_pa_type, + find_common_type, get_time_unit, is_mixed_with_object_dtype, min_scalar_type, @@ -2671,8 +2671,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: and np.issubdtype(dtyp, np.datetime64) for dtyp in not_null_col_dtypes ): - # Use NumPy to find a common dtype - common_dtype = np.find_common_type(not_null_col_dtypes, []) + common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype objs = [obj.astype(common_dtype) for obj in objs] diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 35de4d0ae7c..ae4ad9c5136 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -702,7 +702,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: col_dtype if col_dtype.kind == "f" else np.dtype("int64") ) elif reduction_op == "sum_of_squares": - col_dtype = np.find_common_type([col_dtype], [np.dtype("uint64")]) + 
col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) return col_dtype diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 0aebc6453bc..6a619945e75 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -98,7 +98,7 @@ def _match_join_keys( common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind - else np.find_common_type([], (ltype, rtype)) + else np.result_type(ltype, rtype) ) elif ( np.issubdtype(ltype, np.datetime64) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 758a8cbb535..db48a017138 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -69,9 +69,9 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.dot(x, x.transpose()), lambda x: np.all(x), lambda x: np.any(x), - lambda x: np.product(x), - lambda x: np.product(x, axis=0), - lambda x: np.product(x, axis=1), + lambda x: np.prod(x), + lambda x: np.prod(x, axis=0), + lambda x: np.prod(x, axis=1), ], ) def test_array_func_cudf_dataframe(pd_df, func): diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index 98f801d0cba..fdab8cb5edf 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -52,6 +52,6 @@ def test_make_bool(): n = 10 state = np.random.RandomState(12) arr = gd.datasets.make_bool(n, state) - assert np.alltrue(np.isin(arr, [True, False])) + assert np.all(np.isin(arr, [True, False])) assert arr.size == n assert arr.dtype == bool diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 8ce2adae15b..2b35c808466 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import 
CategoricalDtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -16,7 +17,6 @@ assert_exceptions_equal, expect_warning_if, ) -from cudf.core._compat import PANDAS_GE_200 _JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") @@ -982,7 +982,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = [1, 2] exp_other_data = ["a", "b"] @@ -1012,7 +1012,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) if dtype_l != dtype_r: exp_join_data = [1, 2, 3, 4.5] @@ -1053,7 +1053,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = np.find_common_type([], [np.dtype(dtype_l), np.dtype(dtype_r)]) + exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) exp_join_data = [1, 2, 3] exp_other_data = ["a", "b", "c"] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 538678b47b0..345b7b0aad6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -507,6 +507,34 @@ def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): raise error +def np_find_common_type(*dtypes: np.dtype) -> np.dtype: + """ + np.find_common_type implementation pre-1.25 deprecation using np.result_type + 
https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065 + + Parameters + ---------- + dtypes : np.dtypes + + Returns + ------- + np.dtype + """ + # TODO: possibly raise the TypeError. Coercing to np.dtype("O") (string) + # might not make sense in cudf + try: + common_dtype = np.result_type(*dtypes) + if common_dtype.kind in "mMSU": + # NumPy promotion currently (1.25) misbehaves for for times and strings, + # so fall back to object (find_common_dtype did unless there + # was only one dtype) + common_dtype = np.dtype("O") + + except TypeError: + common_dtype = np.dtype("O") + return common_dtype + + def find_common_type(dtypes): """ Wrapper over np.find_common_type to handle special cases @@ -614,7 +642,7 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - common_dtype = np.find_common_type(list(dtypes), []) + common_dtype = np_find_common_type(*dtypes) if common_dtype == np.dtype("float16"): return cudf.dtype("float32") return cudf.dtype(common_dtype) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 62f218e86a4..9f31c34cbf2 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21,<1.25", + "numpy>=1.21", "protoc-wheel", "pyarrow==14.0.1.*", "rmm==24.2.*", @@ -30,7 +30,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", - "numpy>=1.21,<1.25", + "numpy>=1.21", "nvtx>=0.2.1", "packaging", "pandas==2.1.4", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 062a0224c1f..e7549eb7e10 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.3", "ninja", - "numpy>=1.21,<1.25", + "numpy>=1.21", "pyarrow==14.0.1.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 6613a5f32e3..2567a6ec565 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "cudf==24.2.*", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.21,<1.25", + "numpy>=1.21", "pandas==2.1.4", "rapids-dask-dependency==24.2.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 481ea9cecb8721ebe13aca3650933d34e76de511 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:15:51 -0800 Subject: [PATCH 110/162] Remove pandas shim and use result_type --- python/cudf/cudf/utils/dtypes.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 345b7b0aad6..2dc3a08eb8d 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -507,34 +507,6 @@ def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): raise error -def np_find_common_type(*dtypes: np.dtype) -> np.dtype: - """ - np.find_common_type implementation pre-1.25 deprecation using np.result_type - https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065 - - Parameters - ---------- - dtypes : np.dtypes - - Returns - ------- - np.dtype - """ - # TODO: possibly raise the TypeError. 
Coercing to np.dtype("O") (string) - # might not make sense in cudf - try: - common_dtype = np.result_type(*dtypes) - if common_dtype.kind in "mMSU": - # NumPy promotion currently (1.25) misbehaves for for times and strings, - # so fall back to object (find_common_dtype did unless there - # was only one dtype) - common_dtype = np.dtype("O") - - except TypeError: - common_dtype = np.dtype("O") - return common_dtype - - def find_common_type(dtypes): """ Wrapper over np.find_common_type to handle special cases @@ -642,7 +614,7 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - common_dtype = np_find_common_type(*dtypes) + common_dtype = np.result_type(*dtypes) if common_dtype == np.dtype("float16"): return cudf.dtype("float32") return cudf.dtype(common_dtype) From 4444909b63b6854a9202f8093dfd7ae7833b0d1b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 Jan 2024 08:27:50 +0530 Subject: [PATCH 111/162] FIx more miscellaneous pytests failures (#14895) This PR fixes multiple issues: Enables corr and cov for Datetime and Timedelta types. Properly disables all and any for StringColumn. Preserves groupby categorical index ordering. Catches FutureWarnings in pytests. 
--- python/cudf/cudf/core/column/datetime.py | 18 ++++++++++ python/cudf/cudf/core/column/string.py | 14 ++++++++ python/cudf/cudf/core/column/timedelta.py | 18 ++++++++++ python/cudf/cudf/core/dataframe.py | 4 +-- python/cudf/cudf/core/groupby/groupby.py | 9 ----- python/cudf/cudf/tests/test_dataframe.py | 43 ++++++++--------------- python/cudf/cudf/tests/test_groupby.py | 22 +++++++----- python/cudf/cudf/tests/test_joining.py | 4 +-- python/cudf/cudf/tests/test_stats.py | 38 ++++++++++---------- 9 files changed, 101 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 6f7baebddd3..08a5103b409 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -532,6 +532,24 @@ def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: unit=self.time_unit, ).as_unit(self.time_unit) + def cov(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: DatetimeColumn) -> float: + if not isinstance(other, DatetimeColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") + ) + def quantile( self, q: np.ndarray, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3d222cb762e..b115e6cda48 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5584,6 +5584,20 @@ def data(self): ] return self._data + def all(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. + # If all entries are null the result is True, including when the column + # is empty. 
+ + raise NotImplementedError("`all` not implemented for `StringColumn`") + + def any(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. + # If all entries are null the result is True, including when the column + # is empty. + + raise NotImplementedError("`any` not implemented for `StringColumn`") + def data_array_view( self, *, mode="write" ) -> cuda.devicearray.DeviceNDArray: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b7209bbe7d0..2c12c77277c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -404,6 +404,24 @@ def std( unit=self.time_unit, ).as_unit(self.time_unit) + def cov(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) + + def corr(self, other: TimeDeltaColumn) -> float: + if not isinstance(other, TimeDeltaColumn): + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + return self.as_numerical_column("int64").corr( + other.as_numerical_column("int64") + ) + def components(self, index=None) -> "cudf.DataFrame": """ Return a Dataframe of the components of the Timedeltas. 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5fa1956eaf1..c94b9040693 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1038,7 +1038,6 @@ def _init_from_dict_like( empty_column = functools.partial( cudf.core.column.column_empty, row_count=(0 if index is None else len(index)), - dtype=None, masked=index is not None, ) @@ -6115,7 +6114,8 @@ def _reduce( return Series( index=self._data.to_pandas_index()[:0] if axis == 0 - else source.index + else source.index, + dtype="float64", ) if axis in {0, 2}: if axis == 2 and op in ("kurtosis", "kurt", "skew"): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6aba93855a7..3d0d7d9eba6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -646,15 +646,6 @@ def agg(self, func): how="left", ) result = result.take(indices) - if isinstance(result._index, cudf.CategoricalIndex): - # Needs re-ordering the categories in the order - # they are after grouping. - result._index = cudf.Index( - result._index._column.reorder_categories( - result._index._column._get_decategorized_column() - ), - name=result._index.name, - ) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 026f0aa845d..69be352cf63 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4173,15 +4173,7 @@ def test_dataframe_round_dict_decimal_validation(): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: all does not " - "support columns of object dtype." 
- ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) def test_all(data): @@ -4192,6 +4184,9 @@ def test_all(data): if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype).replace([None], False) gdata = cudf.Series.from_pandas(pdata) + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) else: pdata = pd.DataFrame(data, columns=["a", "b"], dtype=dtype).replace( [None], False @@ -4203,10 +4198,10 @@ def test_all(data): got = gdata.all(bool_only=True) expected = pdata.all(bool_only=True) assert_eq(got, expected) - - got = gdata.all() - expected = pdata.all() - assert_eq(got, expected) + else: + got = gdata.all() + expected = pdata.all() + assert_eq(got, expected) @pytest.mark.parametrize( @@ -4226,21 +4221,13 @@ def test_all(data): [None, None], [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], [[1, True], [2, False], [3, False]], - pytest.param( - [["a", True], ["b", False], ["c", False]], - marks=[ - pytest_xfail( - reason="NotImplementedError: any does not " - "support columns of object dtype." - ) - ], - ), + [["a", True], ["b", False], ["c", False]], ], ) @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): # Provide a dtype when data is empty to avoid future pandas changes. 
- dtype = None if data else float + dtype = float if all(x is None for x in data) or len(data) < 1 else None if np.array(data).ndim <= 1: pdata = pd.Series(data=data, dtype=dtype) gdata = cudf.Series(data=data, dtype=dtype) @@ -4261,10 +4248,10 @@ def test_any(data, axis): got = gdata.any(bool_only=True) expected = pdata.any(bool_only=True) assert_eq(got, expected) - - got = gdata.any(axis=axis) - expected = pdata.any(axis=axis) - assert_eq(got, expected) + else: + got = gdata.any(axis=axis) + expected = pdata.any(axis=axis) + assert_eq(got, expected) @pytest.mark.parametrize("axis", [0, 1]) @@ -10197,7 +10184,7 @@ def test_empty_numeric_only(data): pdf = gdf.to_pandas() expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) - assert_eq(expected, actual) + assert_eq(expected, actual, check_dtype=True) @pytest.fixture(params=[0, 10], ids=["empty", "10"]) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f594963dcda..e3dceeca1f3 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -565,7 +565,9 @@ def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset] - with expect_warning_if(func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning): + with expect_warning_if( + func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning + ): groupby_apply_jit_reductions_special_vals_inner( func, dataset, dtype, special_val ) @@ -1409,7 +1411,7 @@ def test_groupby_multi_agg_hash_groupby(agg): @pytest.mark.parametrize( - "agg", ["min", "max", "idxmax", "idxmax", "sum", "prod", "count", "mean"] + "agg", ["min", "max", "idxmax", "idxmin", "sum", "prod", "count", "mean"] ) def test_groupby_nulls_basic(agg): check_dtype = agg not in _index_type_aggs @@ -1447,11 +1449,12 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # 
Pandas' null semantics. Should we change it? - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), - check_dtype=check_dtype, - ) + with expect_warning_if(agg in {"idxmax", "idxmin"}): + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)().fillna(0), + getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), + check_dtype=check_dtype, + ) def test_groupby_nulls_in_index(): @@ -3702,8 +3705,9 @@ def test_categorical_grouping_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): actual = gdf.groupby("key", sort=False).sum() - expected = pdf.groupby("key", sort=False).sum() - + with pytest.warns(FutureWarning): + # observed param deprecation. + expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 8ce2adae15b..00b4a9b0e01 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -183,8 +183,8 @@ def test_dataframe_join_suffix(): assert list(expect.columns) == list(got.columns) assert_eq(expect.index.values, got.index.values) - got_sorted = got.sort_values(by=list(got.columns), axis=0) - expect_sorted = expect.sort_values(by=list(expect.columns), axis=0) + got_sorted = got.sort_values(by=["b_left", "c", "b_right"], axis=0) + expect_sorted = expect.sort_values(by=["b_left", "c", "b_right"], axis=0) for k in expect_sorted.columns: _check_series(expect_sorted[k].fillna(-1), got_sorted[k].fillna(-1)) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index edd7da3d42c..6dbb23fbf04 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -581,28 +581,28 @@ def test_min_count_ops(data, ops, skipna, min_count): @pytest.mark.parametrize( - "gsr", + "data1", [ - cudf.Series([1, 2, 3, 4], 
dtype="datetime64[ns]"), - cudf.Series([1, 2, 3, 4], dtype="timedelta64[ns]"), + [1, 2, 3, 4], + [10, 1, 3, 5], ], ) -def test_cov_corr_invalid_dtypes(gsr): - psr = gsr.to_pandas() - - assert_exceptions_equal( - lfunc=psr.corr, - rfunc=gsr.corr, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) - - assert_exceptions_equal( - lfunc=psr.cov, - rfunc=gsr.cov, - lfunc_args_and_kwargs=([psr],), - rfunc_args_and_kwargs=([gsr],), - ) +@pytest.mark.parametrize( + "data2", + [ + [1, 2, 3, 4], + [10, 1, 3, 5], + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_cov_corr_datetime_timedelta(data1, data2, dtype): + gsr1 = cudf.Series(data1, dtype=dtype) + gsr2 = cudf.Series(data2, dtype=dtype) + psr1 = gsr1.to_pandas() + psr2 = gsr2.to_pandas() + + assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) + assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) @pytest.mark.parametrize( From 23d189beb1a6f4dc281f22f5c4ce7772d2848767 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:09:34 -1000 Subject: [PATCH 112/162] Fix some pytests (#14894) np.product call I think will be redundant with the existing params, np.var call adjusted to what was tested before matmul failure existed upstream in pandas Snuck in a clean up files leftover by a parquet test (found these leftover when running the test suite locally) --- python/cudf/cudf/tests/test_array_function.py | 3 +-- python/cudf/cudf/tests/test_array_ufunc.py | 11 ++++++++++- python/cudf/cudf/tests/test_parquet.py | 18 ++++++++++-------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 758a8cbb535..58658f8b3cc 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -65,11 +65,10 @@ def test_array_func_cudf_series(np_ar, func): [ lambda x: np.mean(x, axis=0), 
lambda x: np.sum(x, axis=0), - lambda x: np.var(x, ddof=1), + lambda x: np.var(x, ddof=1, axis=0), lambda x: np.dot(x, x.transpose()), lambda x: np.all(x), lambda x: np.any(x), - lambda x: np.product(x), lambda x: np.product(x, axis=0), lambda x: np.product(x, axis=1), ], diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index f5e999559b3..3e3f3aa5dfa 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -7,14 +7,16 @@ import cupy as cp import numpy as np +import pandas as pd import pytest +from packaging import version import cudf from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 from cudf.testing._utils import ( assert_eq, - set_random_null_mask_inplace, expect_warning_if, + set_random_null_mask_inplace, ) _UFUNCS = [ @@ -89,6 +91,13 @@ def test_ufunc_index(request, ufunc): reason=f"cupy has no support for '{fname}'", ) ) + request.applymarker( + pytest.mark.xfail( + condition=fname == "matmul" + and version.parse(pd.__version__) < version.parse("3.0"), + reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", + ) + ) N = 100 # Avoid zeros in either array to skip division by 0 errors. 
Also limit the diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 73cbb924c65..69d3fe0b83f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -21,7 +21,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.core._compat import PANDAS_LT_153, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_153 from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -2683,29 +2683,31 @@ def test_parquet_writer_decimal(decimal_type, data): def test_parquet_writer_column_validation(): + cudf_parquet = BytesIO() + pandas_parquet = BytesIO() df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) pdf = df.to_pandas() with cudf.option_context("mode.pandas_compatible", True): with pytest.warns(UserWarning): - df.to_parquet("cudf.parquet") + df.to_parquet(cudf_parquet) if PANDAS_GE_200: with pytest.warns(UserWarning): - pdf.to_parquet("pandas.parquet") + pdf.to_parquet(pandas_parquet) assert_eq( - pd.read_parquet("cudf.parquet"), - cudf.read_parquet("pandas.parquet"), + pd.read_parquet(cudf_parquet), + cudf.read_parquet(pandas_parquet), ) assert_eq( - cudf.read_parquet("cudf.parquet"), - pd.read_parquet("pandas.parquet"), + cudf.read_parquet(cudf_parquet), + pd.read_parquet(pandas_parquet), ) with cudf.option_context("mode.pandas_compatible", False): with pytest.raises(ValueError): - df.to_parquet("cudf.parquet") + df.to_parquet(cudf_parquet) def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): From 7df96e70289ee38a3a03ce7d70086edc9af62933 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 25 Jan 2024 23:24:09 -0500 Subject: [PATCH 113/162] Align datetimeindex slicing behaviour with Pandas 2.x (#14887) * Align with pandas slicing behaviour for non-monotonic datetime index * Not a TODO --------- Co-authored-by: Ashwin Srinath --- python/cudf/cudf/core/indexed_frame.py | 15 +++++++++++--- 
python/cudf/cudf/tests/test_indexing.py | 27 +++++++++++++++++++++---- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fbbc606d7b8..cb7ff6a00d0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -198,9 +198,18 @@ def _get_label_range_or_mask(index, start, stop, step): if start is not None and stop is not None: if start > stop: return slice(0, 0, None) - # TODO: Once Index binary ops are updated to support logical_and, - # can use that instead of using cupy. - boolean_mask = cp.logical_and((index >= start), (index <= stop)) + if (start in index) and (stop in index): + # when we have a non-monotonic datetime index, return + # values in the slice defined by index_of(start) and + # index_of(end) + start_loc = index.get_loc(start.to_datetime64()) + stop_loc = index.get_loc(stop.to_datetime64()) + 1 + return slice(start_loc, stop_loc) + else: + raise KeyError( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is not allowed.", + ) elif start is not None: boolean_mask = index >= start else: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 8a84a84f681..1cdaa3c52a7 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1278,15 +1278,15 @@ def test_iloc_categorical_index(index): @pytest.mark.parametrize( "sli", [ - slice("2001", "2020"), slice("2001", "2002"), slice("2002", "2001"), - slice(None, "2020"), slice("2001", None), ], ) @pytest.mark.parametrize("is_dataframe", [True, False]) def test_loc_datetime_index(sli, is_dataframe): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + if is_dataframe is True: pd_data = pd.DataFrame( {"a": [1, 2, 3]}, @@ -1299,13 +1299,32 @@ def test_loc_datetime_index(sli, is_dataframe): ) gd_data = cudf.from_pandas(pd_data) - expect = 
pd_data.loc[sli] got = gd_data.loc[sli] - assert_eq(expect, got) +@pytest.mark.parametrize( + "sli", + [ + slice("2001", "2020"), + slice(None, "2020"), + ], +) +def test_loc_datetime_index_slice_not_in(sli): + pd_data = pd.Series( + [1, 2, 3], + pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), + ) + gd_data = cudf.from_pandas(pd_data) + with pytest.raises(KeyError): + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + with pytest.raises(KeyError): + sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) + assert_eq(pd_data.loc[sli], gd_data.loc[sli]) + + @pytest.mark.parametrize( "gdf_kwargs", [ From 87a4d124f73a9f6283e708a876d78d4bdd2c162b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Jan 2024 14:11:14 +0000 Subject: [PATCH 114/162] Deprecations in replace --- python/cudf/cudf/core/indexed_frame.py | 34 ++++++++++++++++++++------ python/cudf/cudf/core/series.py | 5 ++-- python/cudf/cudf/tests/test_replace.py | 6 +++-- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9d264822d14..ea85ee66f1b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -598,11 +598,11 @@ def index(self, value): def replace( self, to_replace=None, - value=None, + value=no_default, inplace=False, limit=None, regex=False, - method=None, + method=no_default, ): """Replace values given in ``to_replace`` with ``value``. 
@@ -803,12 +803,30 @@ def replace( if regex: raise NotImplementedError("regex parameter is not implemented yet") - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" + if method is not no_default: + warnings.warn( + "The 'method' keyword in " + f"{type(self).__name__}.replace is deprecated and " + "will be removed in a future version.", + FutureWarning, ) + elif method not in ("pad", None, no_default): + raise NotImplementedError("method parameter is not implemented") - if not (to_replace is None and value is None): + if ( + value is no_default + and method is no_default + and not is_dict_like(to_replace) + and regex is False + ): + warnings.warn( + f"{type(self).__name__}.replace without 'value' and with " + "non-dict-like 'to_replace' is deprecated " + "and will raise in a future version. " + "Explicitly specify the new values instead.", + FutureWarning, + ) + if not (to_replace is None and value is no_default): copy_data = {} ( all_na_per_column, @@ -5320,7 +5338,7 @@ def _get_replacement_values_for_columns( "value argument must be scalar, list-like or Series" ) elif _is_series(to_replace): - if value is None: + if value is None or value is no_default: to_replace_columns = { col: as_column(to_replace.index) for col in columns_dtype_map } @@ -5351,7 +5369,7 @@ def _get_replacement_values_for_columns( "value" ) elif is_dict_like(to_replace): - if value is None: + if value is None or value is no_default: to_replace_columns = { col: list(to_replace.keys()) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5f03f368664..7ff529dbd05 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -594,7 +594,6 @@ def __init__( copy=False, nan_as_null=True, ): - index_from_data = None name_from_data = None if data is None: @@ -2317,8 +2316,8 @@ def argsort( return obj @_cudf_nvtx_annotate - def replace(self, to_replace=None, 
value=None, *args, **kwargs): - if is_dict_like(to_replace) and value is not None: + def replace(self, to_replace=None, value=no_default, *args, **kwargs): + if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( "Series.replace cannot use dict-like to_replace and non-None " "value" diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 94cee2dca68..ac2b2c6cd30 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1068,8 +1068,10 @@ def test_replace_inplace(pframe, replace_args): assert_eq(gpu_frame, pandas_frame) assert_eq(gpu_copy, cpu_copy) - gpu_frame.replace(**replace_args) - pandas_frame.replace(**replace_args) + with expect_warning_if(len(replace_args) == 0): + gpu_frame.replace(**replace_args) + with expect_warning_if(len(replace_args) == 0): + pandas_frame.replace(**replace_args) assert_eq(gpu_frame, pandas_frame) assert_eq(gpu_copy, cpu_copy) From 7d3e72af69ea38f4150a5a2ff352a300f704fcd0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 26 Jan 2024 23:28:39 +0530 Subject: [PATCH 115/162] Parquet Writer: Write `non-string` columns pandas-compatibility mode only (#14899) This PR enables writing of non-string columns in parquet writer only in pandas-compatibility mode. 
This PR: ``` = 8 failed, 102249 passed, 2090 skipped, 976 xfailed, 312 xpassed in 1363.59s (0:22:43) = ``` On `pandas_2.0_feature_branch`: ``` = 9 failed, 102247 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1336.47s (0:22:16) = ``` Co-authored-by: Lawrence Mitchell --------- Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/_lib/utils.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 50a47b4f507..7ba717a0003 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -85,7 +85,12 @@ cpdef generate_pandas_metadata(table, index): # Columns for name, col in table._data.items(): - col_names.append(name) + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. + col_names.append(str(name)) + else: + col_names.append(name) + if isinstance(col.dtype, cudf.CategoricalDtype): raise ValueError( "'category' column dtypes are currently not " From b61b39d0ebd33113d4070ceae24adf8c58a46ddf Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 26 Jan 2024 13:05:05 -0600 Subject: [PATCH 116/162] Use sets for argument checking. 
--- python/cudf/cudf/core/indexed_frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ea85ee66f1b..6f80f6bb0bc 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -810,7 +810,7 @@ def replace( "will be removed in a future version.", FutureWarning, ) - elif method not in ("pad", None, no_default): + elif method not in {"pad", None, no_default}: raise NotImplementedError("method parameter is not implemented") if ( @@ -5338,7 +5338,7 @@ def _get_replacement_values_for_columns( "value argument must be scalar, list-like or Series" ) elif _is_series(to_replace): - if value is None or value is no_default: + if value in {None, no_default}: to_replace_columns = { col: as_column(to_replace.index) for col in columns_dtype_map } @@ -5369,7 +5369,7 @@ def _get_replacement_values_for_columns( "value" ) elif is_dict_like(to_replace): - if value is None or value is no_default: + if value in {None, no_default}: to_replace_columns = { col: list(to_replace.keys()) for col in columns_dtype_map } From 78eff481a6e8c96ae9076a79700f0873bbbd9fba Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 20:06:24 +0000 Subject: [PATCH 117/162] Fix usage --- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6f80f6bb0bc..d7239dbcf2f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5338,7 +5338,7 @@ def _get_replacement_values_for_columns( "value argument must be scalar, list-like or Series" ) elif _is_series(to_replace): - if value in {None, no_default}: + if value is None or value is no_default: to_replace_columns = { col: as_column(to_replace.index) for col in columns_dtype_map } @@ -5369,7 +5369,7 @@ def 
_get_replacement_values_for_columns( "value" ) elif is_dict_like(to_replace): - if value in {None, no_default}: + if value is None or value is no_default: to_replace_columns = { col: list(to_replace.keys()) for col in columns_dtype_map } From 5618d3da17b3d1d911151120e260e9c0dd5be6cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:10:35 -1000 Subject: [PATCH 118/162] Remove pandas Index subclasses in cudf pandas (#14902) We won't have to proxy these types anymore since they are removed in pandas 2.0 Also removed references to the cudf Index subclasses that are removed in this branch Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14902 --- docs/cudf/source/conf.py | 6 -- python/cudf/cudf/core/index.py | 16 ++---- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 47 ---------------- python/cudf/cudf/tests/test_dataframe.py | 8 +-- python/cudf/cudf/tests/test_index.py | 21 +------ python/cudf/cudf/tests/test_sorting.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 55 +------------------ 8 files changed, 17 insertions(+), 142 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 01a6c5316bd..e76bdb802e4 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -443,12 +443,6 @@ def on_missing_reference(app, env, node, contnode): _prefixed_domain_objects[f"{prefix}{name}"] = name reftarget = node.get("reftarget") - if reftarget == "cudf.core.index.GenericIndex": - # We don't exposed docs for `cudf.core.index.GenericIndex` - # hence we would want the docstring & mypy references to - # use `cudf.Index` - node["reftarget"] = "cudf.Index" - return contnode if "namespacecudf" in reftarget: node["reftarget"] = "cudf" return contnode diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 
b506dfe6158..2bd4219997f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -14,17 +14,17 @@ MutableMapping, Optional, Tuple, - Type, Union, cast, ) import cupy import numpy as np +import pandas as pd +from pandas._config import get_option from typing_extensions import Self import cudf -import pandas as pd from cudf import _lib as libcudf from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence @@ -66,12 +66,8 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import ( - _warn_no_dask_cudf, - search_range, -) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate -from pandas._config import get_option +from cudf.utils.utils import _warn_no_dask_cudf, search_range class IndexMeta(type): @@ -1356,9 +1352,9 @@ def __repr__(self): # from the output due to the type-cast to # object dtype happening above. # Note : The replacing of single quotes has - # to happen only in case of non-StringIndex types, + # to happen only in case of non-Index[string] types, # as we want to preserve single quotes in case - # of StringIndex and it is valid to have them. + # of Index[string] and it is valid to have them. output = output.replace("'", "") else: output = repr(preprocess.to_pandas()) @@ -2947,7 +2943,7 @@ def as_index( result : subclass of Index - CategoricalIndex for Categorical input. - DatetimeIndex for Datetime input. - - GenericIndex for all other inputs. + - Index for all other inputs. Notes ----- diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d8bb09c668a..05ab1edfaba 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,11 +1,11 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import itertools +import warnings from collections import abc from typing import Dict, Optional import cupy -import warnings import numpy as np import pandas as pd @@ -35,7 +35,7 @@ def _align_objs(objs, how="outer", sort=None): A list of reindexed and aligned objects ready for concatenation """ - # Check if multiindex then check if indexes match. GenericIndex + # Check if multiindex then check if indexes match. Index # returns ndarray tuple of bools requiring additional filter. # Then check for duplicate index value. i_objs = iter(objs) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index afcfc13a9c4..137709925df 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -17,7 +17,6 @@ _FastSlowAttribute, _FunctionProxy, _Unusable, - get_final_type_map, make_final_proxy_type as _make_final_proxy_type, make_intermediate_proxy_type as _make_intermediate_proxy_type, register_proxy_func, @@ -203,19 +202,6 @@ def Index__new__(cls, *args, **kwargs): }, ) -get_final_type_map()[cudf.StringIndex] = Index -get_final_type_map()[cudf.Int8Index] = Index -get_final_type_map()[cudf.Int8Index] = Index -get_final_type_map()[cudf.Int16Index] = Index -get_final_type_map()[cudf.Int32Index] = Index -get_final_type_map()[cudf.UInt8Index] = Index -get_final_type_map()[cudf.UInt16Index] = Index -get_final_type_map()[cudf.UInt32Index] = Index -get_final_type_map()[cudf.UInt64Index] = Index -get_final_type_map()[cudf.Float32Index] = Index -get_final_type_map()[cudf.GenericIndex] = Index - - RangeIndex = make_final_proxy_type( "RangeIndex", cudf.RangeIndex, @@ -471,17 +457,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) - -Int64Index = make_final_proxy_type( - "Int64Index", - cudf.Int64Index, - pd.core.indexes.numeric.Int64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - 
bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - UInt8Dtype = make_final_proxy_type( "UInt8Dtype", _Unusable, @@ -518,16 +493,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) -UInt64Index = make_final_proxy_type( - "UInt64Index", - cudf.UInt64Index, - pd.core.indexes.numeric.UInt64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - IntervalIndex = make_final_proxy_type( "IntervalIndex", cudf.IntervalIndex, @@ -593,16 +558,6 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) -Float64Index = make_final_proxy_type( - "Float64Index", - cudf.Float64Index, - pd.core.indexes.numeric.Float64Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={"__init__": _DELETE}, -) - SeriesGroupBy = make_intermediate_proxy_type( "SeriesGroupBy", cudf.core.groupby.groupby.SeriesGroupBy, @@ -1273,8 +1228,6 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): pd.core.indexes.datetimelike.DatetimeTimedeltaMixin, pd.core.indexes.datetimelike.DatetimeIndexOpsMixin, pd.core.indexes.extension.NDArrayBackedExtensionIndex, - pd.core.indexes.numeric.IntegerIndex, - pd.core.indexes.numeric.NumericIndex, pd.core.generic.NDFrame, pd.core.indexes.accessors.PeriodProperties, pd.core.indexes.accessors.Properties, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f7242941561..34dc7ebc68e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11,8 +11,8 @@ import string import textwrap import warnings -from contextlib import contextmanager from collections import OrderedDict, defaultdict, namedtuple +from contextlib import contextmanager from copy import 
copy import cupy @@ -24,6 +24,7 @@ from packaging import version import cudf +from cudf.api.extensions import no_default from cudf.core._compat import ( PANDAS_GE_134, PANDAS_GE_150, @@ -32,7 +33,6 @@ PANDAS_LT_140, PANDAS_LT_203, ) -from cudf.api.extensions import no_default from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -5499,7 +5499,6 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): - gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -5528,7 +5527,6 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): # https://github.com/pandas-dev/pandas/issues/52524 assert_eq(got.astype("datetime64[ns]"), expected) else: - assert_eq(got, expected, check_dtype=False) @@ -10921,7 +10919,7 @@ def test_dataframe_contains(name, contains, other_names): assert (contains in pdf) == expectation assert (contains in gdf) == expectation elif pd.api.types.is_float_dtype(gdf.columns.dtype): - # In some cases, the columns are converted to a Float64Index based on + # In some cases, the columns are converted to a Index[float] based on # the other column names. That casts name values from None to np.nan. 
expectation = contains is np.nan and (name is None or name is np.nan) assert (contains in pdf) == expectation diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e43416e323c..ca8ef83316d 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1265,12 +1265,7 @@ def test_index_basic(data, dtype, name): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) def test_integer_index_apis(data, name, dtype): - if PANDAS_GE_200: - pindex = pd.Index(data, dtype=dtype, name=name) - else: - with pytest.warns(FutureWarning): - pindex = pd.Int64Index(data, dtype=dtype, name=name) - + pindex = pd.Index(data, dtype=dtype, name=name) gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) @@ -1281,12 +1276,7 @@ def test_integer_index_apis(data, name, dtype): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", UNSIGNED_TYPES) def test_unsigned_integer_index_apis(data, name, dtype): - if PANDAS_GE_200: - pindex = pd.Index(data, dtype=dtype, name=name) - else: - with pytest.warns(FutureWarning): - pindex = pd.UInt64Index(data, dtype=dtype, name=name) - + pindex = pd.Index(data, dtype=dtype, name=name) gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) @@ -1297,12 +1287,7 @@ def test_unsigned_integer_index_apis(data, name, dtype): @pytest.mark.parametrize("name", [1, "a", None]) @pytest.mark.parametrize("dtype", FLOAT_TYPES) def test_float_index_apis(data, name, dtype): - if PANDAS_GE_200: - pindex = pd.Index(data, dtype=dtype, name=name) - else: - with pytest.warns(FutureWarning): - pindex = pd.Float64Index(data, dtype=dtype, name=name) - + pindex = pd.Index(data, dtype=dtype, name=name) gindex = cudf.Index(data, dtype=dtype, name=name) assert_eq(pindex, gindex) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 
07f76f1103c..f30c14373bf 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -346,7 +346,7 @@ def _check_scatter_by_map(dfs, col): with pytest.warns(UserWarning): df.scatter_by_map("a", map_size=1, debug=True) # Bad map_size - # Test GenericIndex + # Test Index df2 = df.set_index("c") generic_result = df2.scatter_by_map("b", map_size, keep_index=keep) _check_scatter_by_map(generic_result, df2["b"]) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index df4bed0be0a..2425c323060 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -269,25 +269,6 @@ def test_rename_categories(): tm.assert_series_equal(psr, sr) -def test_rename_categories_inplace(): - psr = pd.Series([1, 2, 3], dtype="category") - sr = xpd.Series([1, 2, 3], dtype="category") - with pytest.warns(FutureWarning): - psr.cat.rename_categories({1: 5}, inplace=True) - sr.cat.rename_categories({1: 5}, inplace=True) - tm.assert_series_equal(psr, sr) - - -def test_rename_categories_inplace_after_copying_parent(): - s = xpd.Series([1, 2, 3], dtype="category") - # cudf does not define "rename_categories", - # so this copies `s` from device to host: - rename_categories = s.cat.rename_categories - _ = len(s) # trigger a copy of `s` from host to device: - with pytest.warns(FutureWarning): - rename_categories([5, 2, 3], inplace=True) - assert s.cat.categories.tolist() == [5, 2, 3] - def test_column_rename(dataframe): pdf, df = dataframe @@ -663,8 +644,7 @@ def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) result = df.rolling(2, win_type="boxcar").mean() - with pytest.warns(DeprecationWarning): - expected = pdf.rolling(2, win_type="boxcar").mean() + expected = pdf.rolling(2, win_type="boxcar").mean() tm.assert_equal(result, expected) @@ -1017,12 +997,6 @@ def __init__(self, myinput): xpd.PeriodIndex, 
xpd.MultiIndex, xpd.IntervalIndex, - xpd.UInt64Index, - xpd.Int64Index, - xpd.Float64Index, - xpd.core.indexes.numeric.UInt64Index, - xpd.core.indexes.numeric.Int64Index, - xpd.core.indexes.numeric.Float64Index, ], ) def test_index_subclass(index_type): @@ -1032,22 +1006,6 @@ def test_index_subclass(index_type): assert not issubclass(xpd.Index, index_type) -def test_index_internal_subclass(): - # test that proxy index types that are not related by inheritance - # still appear to be so if the underlying slow types are related - # by inheritance: - assert issubclass( - xpd.Int64Index, - xpd.core.indexes.numeric.NumericIndex, - ) == issubclass( - pd.Int64Index, - pd.core.indexes.numeric.NumericIndex, - ) - assert isinstance( - xpd.Index([1, 2, 3]), xpd.core.indexes.numeric.NumericIndex - ) == isinstance(pd.Index([1, 2, 3]), pd.core.indexes.numeric.NumericIndex) - - def test_np_array_of_timestamps(): expected = np.array([pd.Timestamp(1)]) + pd.tseries.offsets.MonthEnd() got = np.array([xpd.Timestamp(1)]) + xpd.tseries.offsets.MonthEnd() @@ -1080,7 +1038,7 @@ def test_np_array_of_timestamps(): # Other types xpd.tseries.offsets.BDay(5), xpd.Timestamp("2001-01-01"), - xpd.Timestamp("2001-01-01", freq="D"), + xpd.Timestamp("2001-01-01", tz="UTC"), xpd.Timedelta("1 days"), xpd.Timedelta(1, "D"), ], @@ -1214,15 +1172,6 @@ def test_read_sas_context(): assert isinstance(df, xpd.DataFrame) -@pytest.mark.parametrize( - "idx_obj", ["Float64Index", "Int64Index", "UInt64Index"] -) -def test_pandas_module_getattr_objects(idx_obj): - # Objects that are behind pandas.__getattr__ (version 1.5 specific) - idx = getattr(xpd, idx_obj)([1, 2, 3]) - assert isinstance(idx, xpd.Index) - - def test_concat_fast(): pytest.importorskip("cudf") From d8df8e469c9aa5668121c6e16636f1af3ec8c269 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sun, 28 Jan 2024 11:09:39 -0500 Subject: [PATCH 119/162] Allow `any` and `all` only for all-`NA` and empty string columns (#14898) This PR allows any and all 
for all-NA string columns and string columns that have size 0. This is an essential workaround for time-being because any and all aren't natively supported for string types in libcudf and without these workarounds, multiple places in the reduction APIs will need if/elif checks which will make it harder to maintain. This PR: = 5 failed, 102252 passed, 2090 skipped, 976 xfailed, 312 xpassed in 1375.59s (0:22:55) = On pandas_2.0_feature_branch: = 9 failed, 102247 passed, 2091 skipped, 976 xfailed, 312 xpassed in 1336.47s (0:22:16) = --- python/cudf/cudf/core/column/string.py | 15 ++++++++------- python/cudf/cudf/tests/test_string.py | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fa07b299ecf..2fdcf30606a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5594,16 +5594,17 @@ def data(self): return self._data def all(self, skipna: bool = True) -> bool: - # The skipna argument is only used for numerical columns. - # If all entries are null the result is True, including when the column - # is empty. - + if skipna and self.null_count == self.size: + return True + elif not skipna and self.has_nulls(): + raise TypeError("boolean value of NA is ambiguous") raise NotImplementedError("`all` not implemented for `StringColumn`") def any(self, skipna: bool = True) -> bool: - # The skipna argument is only used for numerical columns. - # If all entries are null the result is True, including when the column - # is empty. 
+ if not skipna and self.has_nulls(): + raise TypeError("boolean value of NA is ambiguous") + elif skipna and self.null_count == self.size: + return False raise NotImplementedError("`any` not implemented for `StringColumn`") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 8eca1a56525..8c8a3cb2399 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -3484,3 +3484,21 @@ def test_str_iterate_error(): s = cudf.Series(["abc", "xyz"]) with pytest.raises(TypeError): iter(s.str) + + +def test_string_reduction_error(): + s = cudf.Series([None, None], dtype="str") + ps = s.to_pandas(nullable=True) + assert_exceptions_equal( + s.any, + ps.any, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) + + assert_exceptions_equal( + s.all, + ps.all, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) From 9fa9dc5222df5bf482b798206b70b02288bd23ca Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 07:11:45 -0600 Subject: [PATCH 120/162] Prevent converting strings to arrow strings in `dask_cudf` pytests (#14914) dask is natively converting all object types to arrow[string] types if proper pyarrow dependency is detected. This is being done in assert_eq API. We will need a change in cudf and dask upstream to be able to support this kind of conversion. I'm coming up with a solution in 24.04 dev cycle, but in the interest of shipping pandas-2.x I'm feeling confident to disable this auto-conversion by setting the dataframe.convert-string dask config to False where necessary. 
--- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 8 ++--- .../dask_cudf/dask_cudf/io/tests/test_json.py | 6 ++-- .../dask_cudf/io/tests/test_parquet.py | 2 +- .../dask_cudf/tests/test_accessor.py | 36 ++++++++++--------- python/dask_cudf/dask_cudf/tests/test_core.py | 26 ++++++++++---- .../dask_cudf/dask_cudf/tests/test_groupby.py | 3 +- .../dask_cudf/tests/test_reductions.py | 28 ++++++++------- 7 files changed, 65 insertions(+), 44 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 5f1aa98e888..987fcf6b4ae 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -226,11 +226,11 @@ def test_read_csv_skiprows_error(csv_begin_bad_lines): def test_read_csv_skipfooter(csv_end_bad_lines): # Repro from Issue#13552 + with dask.config.set({"dataframe.convert-string": False}): + ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute() + ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute() - ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute() - ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute() - - dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False) + dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False) def test_read_csv_skipfooter_error(csv_end_bad_lines): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index fddbfb16e27..5e06832ed94 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import os @@ -80,7 +80,9 @@ def test_read_json_nested(tmp_path): } ) kwargs = dict(orient="records", lines=True) - with tmp_path / "data.json" as f: + with tmp_path / "data.json" as f, dask.config.set( + {"dataframe.convert-string": False} + ): df.to_json(f, **kwargs) # Ensure engine='cudf' is tested. actual = dask_cudf.read_json(f, engine="cudf", **kwargs) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6c53193d7cd..583d4b07f6f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import glob import math diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 2f5dcb524a5..f6b8c34fef0 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -2,7 +2,9 @@ import numpy as np import pandas as pd +import dask import pytest + from pandas.testing import assert_series_equal from dask import dataframe as dd @@ -137,30 +139,30 @@ def test_categorical_basic(data): 4 a """ assert all(x == y for x, y in zip(string.split(), expect_str.split())) + with dask.config.set({"dataframe.convert-string": False}): + df = DataFrame() + df["a"] = ["xyz", "abc", "def"] * 10 - df = DataFrame() - df["a"] = ["xyz", "abc", "def"] * 10 - - pdf = df.to_pandas() - cddf = dgd.from_cudf(df, 1) - cddf["b"] = cddf["a"].astype("category") + pdf = df.to_pandas() + cddf = dgd.from_cudf(df, 1) + cddf["b"] = cddf["a"].astype("category") - ddf = dd.from_pandas(pdf, 1) - ddf["b"] = ddf["a"].astype("category") + ddf = dd.from_pandas(pdf, 1) + ddf["b"] = ddf["a"].astype("category") - assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"]) + assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"]) - with 
pytest.raises(NotImplementedError): - cddf["b"].cat.categories + with pytest.raises(NotImplementedError): + cddf["b"].cat.categories - with pytest.raises(NotImplementedError): - ddf["b"].cat.categories + with pytest.raises(NotImplementedError): + ddf["b"].cat.categories - cddf = cddf.categorize() - ddf = ddf.categorize() + cddf = cddf.categorize() + ddf = ddf.categorize() - assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories) - assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered) + assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories) + assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered) @pytest.mark.parametrize("data", [data_cat_1()]) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 4f77887033a..552d800e2dd 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -782,14 +782,16 @@ def test_dataframe_set_index(): df["str"] = list("abcdefghijklmnopqrstuvwxyz") pdf = df.to_pandas() - ddf = dgd.from_cudf(df, npartitions=4) - ddf = ddf.set_index("str") + with dask.config.set({"dataframe.convert-string": False}): + ddf = dgd.from_cudf(df, npartitions=4) + ddf = ddf.set_index("str") - pddf = dd.from_pandas(pdf, npartitions=4) - pddf = pddf.set_index("str") - from cudf.testing._utils import assert_eq + pddf = dd.from_pandas(pdf, npartitions=4) + pddf = pddf.set_index("str") + + from cudf.testing._utils import assert_eq - assert_eq(ddf.compute(), pddf.compute()) + assert_eq(ddf.compute(), pddf.compute()) def test_series_describe(): @@ -938,3 +940,15 @@ def test_categorical_dtype_round_trip(): actual = ds.compute() expected = pds.compute() assert actual.dtype.ordered == expected.dtype.ordered + + +def test_object_to_string_fail(request): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/14915", + ) + ) + s = cudf.Series(["a", "b", "c"] * 10) + ds = dgd.from_cudf(s, npartitions=2) + pds = 
dd.from_pandas(s.to_pandas(), npartitions=2) + dd.assert_eq(ds.sort_values(), pds.sort_values()) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 0dc57d8df55..cef8bdacace 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -610,7 +610,8 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): if as_index: # Groupby columns became the index. # Sorting the index should not change anything. - dd.assert_eq(gf.index, gf.sort_index().index) + with dask.config.set({"dataframe.convert-string": False}): + dd.assert_eq(gf.index, gf.sort_index().index) else: # Groupby columns are did NOT become the index. # Sorting by these columns should not change anything. diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 56d2b42efbc..e347e8be9e4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import dask from dask import dataframe as dd import cudf @@ -69,16 +70,17 @@ def test_rowwise_reductions(data, op): gddf = dgd.from_cudf(data, npartitions=10) pddf = gddf.to_dask_dataframe() - if op in ("var", "std"): - expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0) - got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0) - else: - expected = getattr(pddf, op)(numeric_only=True, axis=1) - got = getattr(pddf, op)(numeric_only=True, axis=1) - - dd.assert_eq( - expected, - got, - check_exact=False, - check_dtype=op not in ("var", "std"), - ) + with dask.config.set({"dataframe.convert-string": False}): + if op in ("var", "std"): + expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0) + got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0) + else: + expected = getattr(pddf, op)(numeric_only=True, axis=1) + got = 
getattr(pddf, op)(numeric_only=True, axis=1) + + dd.assert_eq( + expected, + got, + check_exact=False, + check_dtype=op not in ("var", "std"), + ) From 784fe95bf21b77e58102cee3bce8bea20ec262ee Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Jan 2024 13:31:28 +0000 Subject: [PATCH 121/162] Enable full CI --- .github/workflows/pr.yaml | 181 +++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 21138651f76..9c30161ab36 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,24 +13,24 @@ jobs: pr-builder: needs: - checks - #- conda-cpp-build - #- conda-cpp-checks - #- conda-cpp-tests - #- conda-python-build - #- conda-python-cudf-tests - #- conda-python-other-tests - #- conda-java-tests - #- conda-notebook-tests - #- docs-build + - conda-cpp-build + - conda-cpp-checks + - conda-cpp-tests + - conda-python-build + - conda-python-cudf-tests + - conda-python-other-tests + - conda-java-tests + - conda-notebook-tests + - docs-build - wheel-build-cudf - wheel-tests-cudf - wheel-build-dask-cudf - wheel-tests-dask-cudf - #- devcontainer + - devcontainer - unit-tests-cudf-pandas - pandas-tests - #- pandas-tests-diff - #- pandas-tests-diff-comment + - pandas-tests-diff + - pandas-tests-diff-comment secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: @@ -38,77 +38,76 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 with: enable_check_generated_files: false - enable_check_style: false - #conda-cpp-build: - # needs: checks - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 - # with: - # build_type: pull-request - #conda-cpp-checks: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 - # with: - # build_type: 
pull-request - # enable_check_symbols: true - #conda-cpp-tests: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 - # with: - # build_type: pull-request - #conda-python-build: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 - # with: - # build_type: pull-request - #conda-python-cudf-tests: - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 - # with: - # build_type: pull-request - # test_script: "ci/test_python_cudf.sh" - #conda-python-other-tests: - # # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 - # with: - # build_type: pull-request - # test_script: "ci/test_python_other.sh" - #conda-java-tests: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - # with: - # build_type: pull-request - # node_type: "gpu-v100-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:latest" - # run_script: "ci/test_java.sh" - #conda-notebook-tests: - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - # with: - # build_type: pull-request - # node_type: "gpu-v100-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:latest" - # run_script: "ci/test_notebooks.sh" - #docs-build: - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - # with: - # build_type: pull-request - # node_type: "gpu-v100-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:latest" - # run_script: 
"ci/build_docs.sh" + conda-cpp-build: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + with: + build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + with: + build_type: pull-request + enable_check_symbols: true + conda-cpp-tests: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + with: + build_type: pull-request + conda-python-build: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + with: + build_type: pull-request + conda-python-cudf-tests: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + with: + build_type: pull-request + test_script: "ci/test_python_cudf.sh" + conda-python-other-tests: + # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + with: + build_type: pull-request + test_script: "ci/test_python_other.sh" + conda-java-tests: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/test_java.sh" + conda-notebook-tests: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/test_notebooks.sh" + docs-build: + 
needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks secrets: inherit @@ -140,14 +139,14 @@ jobs: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request script: ci/test_wheel_dask_cudf.sh - #devcontainer: - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 - # with: - # build_command: | - # sccache -z; - # build-all -DBUILD_BENCHMARKS=ON --verbose; - # sccache -s; + devcontainer: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + with: + build_command: | + sccache -z; + build-all -DBUILD_BENCHMARKS=ON --verbose; + sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit From 51e42c10fb8d2eab8d5e56502aaaae299af7d397 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Jan 2024 13:34:13 +0000 Subject: [PATCH 122/162] Fix spacings --- .github/workflows/pr.yaml | 142 +++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9c30161ab36..308f09b4b7f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -29,8 +29,8 @@ jobs: - devcontainer - unit-tests-cudf-pandas - pandas-tests - - pandas-tests-diff - - pandas-tests-diff-comment + #- pandas-tests-diff + #- pandas-tests-diff-comment secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: @@ -39,75 +39,75 @@ jobs: with: enable_check_generated_files: false conda-cpp-build: - needs: checks - secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 - with: - build_type: pull-request + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + with: + build_type: pull-request conda-cpp-checks: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 - with: - build_type: pull-request - enable_check_symbols: true + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + with: + build_type: pull-request + enable_check_symbols: true conda-cpp-tests: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 - with: - build_type: pull-request + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + with: + build_type: pull-request conda-python-build: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 - with: - build_type: pull-request + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + with: + build_type: pull-request conda-python-cudf-tests: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 - with: - build_type: pull-request - test_script: "ci/test_python_cudf.sh" + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + with: + build_type: pull-request + test_script: "ci/test_python_cudf.sh" conda-python-other-tests: - # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism - needs: conda-python-build - secrets: 
inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 - with: - build_type: pull-request - test_script: "ci/test_python_other.sh" + # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + with: + build_type: pull-request + test_script: "ci/test_python_other.sh" conda-java-tests: - needs: conda-cpp-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/test_java.sh" + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/test_java.sh" conda-notebook-tests: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/test_notebooks.sh" + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/test_notebooks.sh" docs-build: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/build_docs.sh" + needs: conda-python-build 
+ secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks secrets: inherit @@ -140,13 +140,13 @@ jobs: build_type: pull-request script: ci/test_wheel_dask_cudf.sh devcontainer: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 - with: - build_command: | - sccache -z; - build-all -DBUILD_BENCHMARKS=ON --verbose; - sccache -s; + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + with: + build_command: | + sccache -z; + build-all -DBUILD_BENCHMARKS=ON --verbose; + sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit @@ -195,4 +195,4 @@ jobs: # owner: context.repo.owner, # repo: context.repo.repo, # body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n` - # }) + # }) \ No newline at end of file From eae873e6b4f5f7234f7f8cee858d5d1adc788cc5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 07:36:06 -0600 Subject: [PATCH 123/162] Update pr.yaml --- .github/workflows/pr.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 308f09b4b7f..734c7643808 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -195,4 +195,5 @@ jobs: # owner: context.repo.owner, # repo: context.repo.repo, # body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n` - # }) \ No newline at end of file + # }) + From dbf08cb84026e185971f29b71e278765c3d55093 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 08:48:51 -0600 Subject: [PATCH 124/162] Fix style issues in 2.0 feature branch (#14918) This PR fixes all style issues in 
pandas-2.0 feature branch --- .github/workflows/pr.yaml | 1 - python/cudf/cudf/_lib/column.pyx | 1 - python/cudf/cudf/api/types.py | 4 ++-- python/cudf/cudf/core/column/datetime.py | 1 - python/cudf/cudf/core/column/interval.py | 3 +-- python/cudf/cudf/core/column/timedelta.py | 3 +-- python/cudf/cudf/core/dataframe.py | 1 - python/cudf/cudf/core/dtypes.py | 14 +++++------ python/cudf/cudf/core/multiindex.py | 8 +++++-- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/tests/test_api_types.py | 3 +-- python/cudf/cudf/tests/test_applymap.py | 2 +- python/cudf/cudf/tests/test_array_function.py | 3 ++- python/cudf/cudf/tests/test_categorical.py | 5 ---- .../cudf/cudf/tests/test_column_accessor.py | 2 +- python/cudf/cudf/tests/test_concat.py | 7 +++--- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_datasets.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 24 ++++++------------- python/cudf/cudf/tests/test_dropna.py | 1 - python/cudf/cudf/tests/test_duplicates.py | 5 +--- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 7 +----- python/cudf/cudf/tests/test_parquet.py | 1 - python/cudf/cudf/tests/test_resampling.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 3 --- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_stats.py | 5 +--- python/cudf/cudf/tests/test_timedelta.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 1 - .../dask_cudf/tests/test_accessor.py | 3 +-- 31 files changed, 43 insertions(+), 79 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 734c7643808..14a74618413 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -196,4 +196,3 @@ jobs: # repo: context.repo.repo, # body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n` # }) - diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index b97dc85ef8b..45aa1081b8d 100644 --- 
a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -12,7 +12,6 @@ import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import _is_categorical_dtype, _is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 22fc3ea2c6f..f6f5e522cbd 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -12,9 +12,10 @@ import cupy as cp import numpy as np +import pandas as pd +from pandas.api import types as pd_types import cudf -import pandas as pd from cudf.core._compat import PANDAS_GE_150 from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, @@ -30,7 +31,6 @@ is_list_dtype, is_struct_dtype, ) -from pandas.api import types as pd_types def is_numeric_dtype(obj): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 08a5103b409..a0c0b119ef7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -28,7 +28,6 @@ is_scalar, is_timedelta64_dtype, ) - from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_220 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f73c222624a..f5d527ad201 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,11 +1,10 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from typing import Optional -import pyarrow as pa import pandas as pd +import pyarrow as pa import cudf - from cudf.api.types import _is_interval_dtype from cudf.core.column import StructColumn from cudf.core.dtypes import CategoricalDtype, IntervalDtype diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2c12c77277c..094ccb57a1c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -14,12 +14,11 @@ from cudf import _lib as libcudf from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.core._compat import PANDAS_GE_200 from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls -from cudf.core._compat import PANDAS_GE_200 - _dtype_to_format_conversion = { "timedelta64[ns]": "%D days %H:%M:%S", diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 86947fe6028..d0cbacfb7e8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6099,7 +6099,6 @@ def _reduce( numeric_only=False, **kwargs, ): - source = self if axis is None: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 0eb2e455544..734dd501d48 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -9,21 +9,21 @@ from typing import Any, Callable, Dict, List, Tuple, Type, Union import numpy as np +import pandas as pd import pyarrow as pa +from pandas.api import types as pd_types +from pandas.api.extensions import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype as pd_CategoricalDtype, + CategoricalDtypeType as pd_CategoricalDtypeType, +) import cudf -import pandas as pd from cudf._typing import Dtype from 
cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply -from pandas.api import types as pd_types -from pandas.api.extensions import ExtensionDtype -from pandas.core.dtypes.dtypes import ( - CategoricalDtype as pd_CategoricalDtype, - CategoricalDtypeType as pd_CategoricalDtypeType, -) if PANDAS_GE_150: from pandas.core.arrays.arrow.extension_types import ArrowIntervalType diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e6ac114bfcb..081109e81bc 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -34,8 +34,8 @@ ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like -from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: @@ -726,7 +726,11 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - {"idx": cudf.Series(column.as_column(range(len(frame))))} + { + "idx": cudf.Series( + column.as_column(range(len(frame))) + ) + } ), ], axis=1, diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index dc7d6b84d9b..fc253c5c197 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -4,9 +4,9 @@ import cupy as cp import numpy as np +import pandas as pd import cudf -import pandas as pd from cudf._lib.unary import is_nan from cudf.api.types import ( _is_categorical_dtype, diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index cc26815920c..7780f9853a2 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -6,9 +6,8 
@@ from pandas.api import types as pd_types import cudf -from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_214 from cudf.api import types - +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_GE_214 from cudf.testing._utils import expect_warning_if diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 523a7f424e8..38a34c206d7 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -3,8 +3,8 @@ import pytest from cudf import NA, DataFrame -from cudf.testing import _utils as utils from cudf.core._compat import PANDAS_GE_210 +from cudf.testing import _utils as utils @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index b3f5267c3b2..090e8884991 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 0f7abab8104..ad32ebce01b 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -346,7 +346,6 @@ def test_categorical_set_categories_preserves_order(): def test_categorical_as_ordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) @@ -362,7 +361,6 @@ def test_categorical_as_ordered(pd_str_cat): def test_categorical_as_unordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) @@ -380,7 +378,6 @@ def test_categorical_as_unordered(pd_str_cat): @pytest.mark.parametrize("from_ordered", [True, False]) @pytest.mark.parametrize("to_ordered", [True, False]) def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) @@ -401,7 +398,6 @@ def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): def test_categorical_add_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) @@ -419,7 +415,6 @@ def test_categorical_add_categories(pd_str_cat): def test_categorical_remove_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) cd_sr = cudf.Series(pd_str_cat.copy()) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index e2a2b307856..bf764b02faa 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -5,9 +5,9 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.core.column_accessor import ColumnAccessor from cudf.testing._utils import assert_eq 
-from cudf.core._compat import PANDAS_GE_200 simple_test_data = [ {}, diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 895e35614fa..ed8f32ed12b 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,17 +1,16 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +import warnings +from contextlib import contextmanager from decimal import Decimal -import warnings import numpy as np import pandas as pd import pytest -from contextlib import contextmanager - import cudf as gd from cudf.api.types import _is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( assert_eq, diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2fa4a313d6f..6de66bf1952 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -17,7 +17,7 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_LT_140, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_140 from cudf.testing._utils import assert_eq, assert_exceptions_equal diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index fdab8cb5edf..320c221fcb2 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import numpy as np diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 14a732deea4..d83b46250d0 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2,6 +2,7 @@ import datetime import operator +import warnings import cupy as cp import numpy as np @@ -10,15 +11,14 @@ import pytest import cudf -import warnings import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series from cudf.core._compat import ( - PANDAS_GE_150, - PANDAS_LT_140, PANDAS_EQ_200, + PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210, + PANDAS_LT_140, ) from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( @@ -623,22 +623,13 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("dayfirst", [True, False]) def test_cudf_to_datetime(data, dayfirst): pd_data = data - is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) - is_string_data = ( - gd_data.ndim == 1 - and not gd_data.empty - and gd_data.dtype.kind == "O" - ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data - is_string_data = isinstance(gd_data, list) and isinstance( - next(iter(gd_data), None), str - ) expected = pd.to_datetime(pd_data, dayfirst=dayfirst) actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) @@ -696,7 +687,6 @@ def test_to_datetime_errors(data): def test_to_datetime_not_implemented(): - with pytest.raises(NotImplementedError): cudf.to_datetime([], exact=False) @@ -817,7 +807,6 @@ def test_to_datetime_different_formats_notimplemented(): def test_datetime_can_cast_safely(): - sr = cudf.Series( ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" ) @@ -938,7 +927,6 @@ def test_str_to_datetime_error(): @pytest.mark.parametrize("data_dtype", DATETIME_TYPES) @pytest.mark.parametrize("other_dtype", DATETIME_TYPES) def test_datetime_subtract(data, other, 
data_dtype, other_dtype): - gsr = cudf.Series(data, dtype=data_dtype) psr = gsr.to_pandas() @@ -1580,7 +1568,8 @@ def test_date_range_start_end_freq(request, start, end, freq): request.applymarker( pytest.mark.xfail( condition=( - not PANDAS_GE_200 and isinstance(freq, dict) + not PANDAS_GE_200 + and isinstance(freq, dict) and freq.get("hours", None) == 10 and freq.get("days", None) == 57 and freq.get("nanoseconds", None) == 3 @@ -1634,7 +1623,8 @@ def test_date_range_start_freq_periods(request, start, freq, periods): request.applymarker( pytest.mark.xfail( condition=( - not PANDAS_GE_200 and isinstance(freq, dict) + not PANDAS_GE_200 + and isinstance(freq, dict) and freq.get("hours", None) == 10 and freq.get("days", None) == 57 and freq.get("nanoseconds", None) == 3 diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 4b665cb6f0a..f1acd7b4320 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -21,7 +21,6 @@ @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) if len(data) > 0: diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index b01130d5fa1..447b2b3c4f5 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -9,10 +9,7 @@ import cudf from cudf import concat -from cudf.testing._utils import ( - assert_eq, - assert_exceptions_equal, -) +from cudf.testing._utils import assert_eq, assert_exceptions_equal # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. 
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 77a25e2dbae..0c71d74f89f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -22,9 +22,9 @@ from cudf.api.extensions import no_default from cudf.core._compat import ( PANDAS_GE_150, - PANDAS_LT_140, PANDAS_GE_200, PANDAS_GE_210, + PANDAS_LT_140, ) from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index c84088c1cd3..53919a95115 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -10,11 +10,7 @@ import cudf from cudf import Index, MultiIndex, Series -from cudf.core.index import ( - CategoricalIndex, - DatetimeIndex, - RangeIndex, -) +from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex from cudf.testing._utils import assert_eq @@ -46,7 +42,6 @@ def test_range_index(testrange): ], ) def test_generic_index(testlist): - index = Index(testlist) index_pd = pd.Index(testlist) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 69d3fe0b83f..105c31cc71f 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1622,7 +1622,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=20, dtype="float64") expect = pd.concat([test_pdf1, test_pdf2]) diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 7cc4b465873..6281d54aa60 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -5,8 +5,8 @@ import pytest import cudf -from cudf.testing._utils import 
assert_eq from cudf.core._compat import PANDAS_GE_200 +from cudf.testing._utils import assert_eq def assert_resample_results_equal(lhs, rhs, **kwargs): diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 7d3e19c002b..22dcf5dfa7e 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -151,7 +151,6 @@ def test_rolling_with_offset(agg): @pytest.mark.parametrize("seed", [100, 2000]) @pytest.mark.parametrize("window_size", [2, 10, 100]) def test_rolling_var_std_large(agg, ddof, center, seed, window_size): - iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) @@ -312,7 +311,6 @@ def test_rolling_getitem_window(): ) @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index) gsr = cudf.from_pandas(psr) @@ -349,7 +347,6 @@ def some_func(A): ) @pytest.mark.parametrize("center", [True, False]) def test_rolling_dataframe_numba_udf_basic(data, center): - pdf = pd.DataFrame(data) gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 2e39345f63a..b7be3878412 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -14,6 +14,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_140 from cudf.errors import MixedTypeError from cudf.testing._utils import ( @@ -25,7 +26,6 @@ expect_warning_if, gen_rand, ) -from cudf.api.extensions import no_default def _series_na_data(): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6dbb23fbf04..b35dd28c4ec 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -9,13 +9,13 @@ import cudf from cudf.api.extensions import no_default +from 
cudf.core._compat import PANDAS_GE_210 from cudf.datasets import randomdata from cudf.testing._utils import ( assert_eq, assert_exceptions_equal, expect_warning_if, ) -from cudf.core._compat import PANDAS_GE_210 params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", "mean", "var", "std"] @@ -182,7 +182,6 @@ def test_exact_quantiles_int(int_method): def test_approx_quantiles(): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] @@ -222,7 +221,6 @@ def test_approx_quantiles_int(): ], ) def test_misc_quantiles(data, q): - pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) gdf_series = cudf.from_pandas(pdf_series) @@ -503,7 +501,6 @@ def test_corr1d(data1, data2, method): @pytest.mark.parametrize("method", ["spearman", "pearson"]) def test_df_corr(method): - gdf = randomdata(100, {str(x): float for x in range(50)}) pdf = gdf.to_pandas() got = gdf.corr(method) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 850c56b7614..980a8c0df2e 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -9,9 +9,9 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_200 from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq, assert_exceptions_equal -from cudf.core._compat import PANDAS_GE_200 _TIMEDELTA_DATA = [ [1000000, 200000, 3000000], diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2425c323060..e36e1a68114 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -269,7 +269,6 @@ def test_rename_categories(): tm.assert_series_equal(psr, sr) - def test_column_rename(dataframe): pdf, df = dataframe pdf.columns = ["x", "y"] diff --git 
a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index f6b8c34fef0..a6a457d98a4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -2,11 +2,10 @@ import numpy as np import pandas as pd -import dask import pytest - from pandas.testing import assert_series_equal +import dask from dask import dataframe as dd from cudf import DataFrame, Series, date_range From e74fe0a1a0f3e40e4514fca510429f5e2e33fa76 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 06:58:34 -0800 Subject: [PATCH 125/162] Remove gated xfails (#14905) This removes xpassing tests from the test output. --- python/cudf/cudf/tests/test_index.py | 40 +++--------------------- python/cudf/cudf/tests/test_timedelta.py | 8 +---- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index ca8ef83316d..4dfbcf138c3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2274,45 +2274,13 @@ def test_range_index_concat(objs): [ (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), - pytest.param( - pd.RangeIndex(0, 10, 2), - pd.RangeIndex(1, 5, 3), - marks=pytest.mark.xfail( - condition=PANDAS_GE_200, - reason="https://github.com/pandas-dev/pandas/issues/53490", - strict=False, - ), - ), - pytest.param( - pd.RangeIndex(1, 5, 3), - pd.RangeIndex(0, 10, 2), - marks=pytest.mark.xfail( - condition=PANDAS_GE_200, - reason="https://github.com/pandas-dev/pandas/issues/53490", - strict=False, - ), - ), - pytest.param( - pd.RangeIndex(1, 10, 3), - pd.RangeIndex(1, 5, 2), - marks=pytest.mark.xfail( - condition=PANDAS_GE_200, - reason="https://github.com/pandas-dev/pandas/issues/53490", - strict=False, - ), - ), + (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), + (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), + 
(pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), - pytest.param( - pd.RangeIndex(1, 100, 6), - pd.RangeIndex(1, 50, 3), - marks=pytest.mark.xfail( - condition=PANDAS_GE_200, - reason="https://github.com/pandas-dev/pandas/issues/53490", - strict=False, - ), - ), + (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 980a8c0df2e..7cae2f3a30f 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -538,13 +538,7 @@ def test_timedelta_series_mod_with_scalar_zero(reverse): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), - pytest.param( - np.timedelta64("nat", "s"), - marks=pytest.mark.xfail( - strict=False, - reason="https://github.com/pandas-dev/pandas/issues/52295", - ), - ), + np.timedelta64("nat", "s"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), From f69ae1d110ce6389ccef115fe5ca49d36066b8ca Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 09:22:37 -0600 Subject: [PATCH 126/162] Add `Groupby.indices` property and deprecate `obj` in `get_group` (#14912) This PR: Introduces Groupby.indices property. 
Deprecates obj in Groupby.get_group --- python/cudf/cudf/core/groupby/groupby.py | 34 ++++++++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 13 +++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b73d5532100..b3577444f6b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -344,6 +344,33 @@ def groups(self): zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) ) + @cached_property + def indices(self): + """ + Dict {group name -> group indices}. + + Examples + -------- + >>> import cudf + >>> data = [[10, 20, 30], [10, 30, 40], [40, 50, 30]] + >>> df = cudf.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 10 20 30 + 1 10 30 40 + 2 40 50 30 + >>> df.groupby(by=["a"]).indices + {10: array([0, 1]), 40: array([2])} + """ + group_names, offsets, _, grouped_values = self._grouped() + + return dict( + zip( + group_names.to_pandas(), + np.split(grouped_values.index.values, offsets[1:-1]), + ) + ) + @_cudf_nvtx_annotate def get_group(self, name, obj=None): """ @@ -379,6 +406,13 @@ def get_group(self, name, obj=None): """ if obj is None: obj = self.obj + else: + warnings.warn( + "obj is deprecated and will be removed in a future version. 
" + "Use ``df.iloc[gb.indices.get(name)]`` " + "instead of ``gb.get_group(name, obj=df)``.", + FutureWarning, + ) return obj.loc[self.groups[name].drop_duplicates()] diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0c71d74f89f..526aa9f503a 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3130,11 +3130,20 @@ def test_groupby_get_group(pdf, group, name, obj): else: gobj = obj - expected = pdf.groupby(group).get_group(name=name, obj=obj) - actual = gdf.groupby(group).get_group(name=name, obj=gobj) + pgb = pdf.groupby(group) + ggb = gdf.groupby(group) + with expect_warning_if(obj is not None): + expected = pgb.get_group(name=name, obj=obj) + with expect_warning_if(obj is not None): + actual = ggb.get_group(name=name, obj=gobj) assert_groupby_results_equal(expected, actual) + expected = pdf.iloc[pgb.indices.get(name)] + actual = gdf.iloc[ggb.indices.get(name)] + + assert_eq(expected, actual) + @pytest.mark.parametrize( "by", From fc790ab0a4650188f975ca139313f011d6427a4d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 13:41:20 -0600 Subject: [PATCH 127/162] Change pandas version range (#14919) This PR pins pandas version range from 2.0 through 2.1.4 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 18575ba861b..e749d223bea 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas==2.1.4 +- pandas>=2.0,<2.1.5dev0 - pandoc - pip - pre-commit diff 
--git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index abbbb7c2758..80ca746fb38 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - nvcomp==3.0.5 - nvtx>=0.2.1 - packaging -- pandas==2.1.4 +- pandas>=2.0,<2.1.5dev0 - pandoc - pip - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 73cb59bd97a..0dffdc10421 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -76,7 +76,7 @@ requirements: - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} - python - typing_extensions >=4.0.0 - - pandas >=1.3,<1.6.0dev0 + - pandas >=2.0,<2.1.5dev0 - cupy >=12.0.0 # TODO: Pin to numba<0.58 until #14160 is resolved - numba >=0.57,<0.58 diff --git a/dependencies.yaml b/dependencies.yaml index 96bcf66f99b..cb2102910b5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -501,7 +501,7 @@ dependencies: packages: - fsspec>=0.6.0 - *numpy - - pandas==2.1.4 + - pandas>=2.0,<2.1.5dev0 run_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 4e6b8d984b1..81fe0bec325 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "numpy>=1.21", "nvtx>=0.2.1", "packaging", - "pandas==2.1.4", + "pandas>=2.0,<2.1.5dev0", "protobuf>=4.21,<5", "ptxcompiler", "pyarrow>=14.0.1,<15.0.0a0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 5f0596a1d6a..52ff31af7ba 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.21", - "pandas==2.1.4", + "pandas>=2.0,<2.1.5dev0", "rapids-dask-dependency==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ From 5abe6b545198d87fe822a8fbc53ecd5271c9056b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 29 Jan 2024 15:10:20 -0600 Subject: [PATCH 128/162] Fix custreamz pytests to test on float64 types (#14925) This PR passes types to empty dataframe construction because reductions were being performed on float64 types and now empty column default type is object. --- python/custreamz/custreamz/tests/test_dataframes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 1a1fc84ef89..7ce398c7617 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """ Tests for Streamz Dataframes (SDFs) built on top of cuDF DataFrames. 
@@ -863,7 +863,7 @@ def test_rolling_aggs_with_start_state(stream): def test_window_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.window(2, with_state=True, start=None) @@ -881,7 +881,7 @@ def test_window_aggs_with_start_state(stream): assert output0[-1][1] == 450 stream = Stream() - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output1 = ( sdf.window(2, with_state=True, start=output0[-1][0]) @@ -895,7 +895,7 @@ def test_window_aggs_with_start_state(stream): def test_windowed_groupby_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.window(5, with_state=True, start=None) @@ -915,7 +915,7 @@ def test_windowed_groupby_aggs_with_start_state(stream): stream.emit(df) stream = Stream() - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output1 = ( sdf.window(5, with_state=True, start=output0[-1][0]) From eb957d97b07fdd5cac88348f9568db96ff9baeb4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 15:40:51 -0800 Subject: [PATCH 129/162] Revert unnecessary copyright changes --- python/cudf/CMakeLists.txt | 2 +- python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake | 2 +- python/cudf/cmake/Modules/WheelHelpers.cmake | 2 +- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/_lib/cpp/copying.pxd | 2 +- python/cudf/cudf/_lib/cpp/stream_compaction.pxd | 2 +- python/cudf/cudf/_lib/io/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 2 
+- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/table.pxd | 2 +- python/cudf/cudf/_lib/scalar.pxd | 2 +- python/cudf/cudf/_lib/strings/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/strings/convert/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/strings/split/CMakeLists.txt | 2 +- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/struct.py | 2 +- python/cudf/cudf/core/udf/groupby_typing.py | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 481d6194a03..77771afe0e6 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake index 93695dd44dc..6b543433a5d 100644 --- a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake +++ b/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at diff --git a/python/cudf/cmake/Modules/WheelHelpers.cmake b/python/cudf/cmake/Modules/WheelHelpers.cmake index 3abe98a0647..278d6751c15 100644 --- a/python/cudf/cmake/Modules/WheelHelpers.cmake +++ b/python/cudf/cmake/Modules/WheelHelpers.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8697f731dfb..b67c26f779f 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index b707fa2a6fc..8eb0500617f 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import pickle diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 6b6b4c87c1d..f3e5c0aec72 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 803b06bc8ae..aef2f639d76 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt index 34c5e8a6e5f..2408fa1c12f 100644 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/io/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index f099adf7e0a..55301789812 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index ed29a3b3be3..21785a9b108 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2023, NVIDIA CORPORATION. from libcpp cimport bool as cbool diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index b95f8233f4d..4c47de5c0c6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 2d7fb7d7149..de4ffd73be3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pyarrow cimport lib as pa diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index eaf67aa659f..b5c5a8a64a3 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index dbb2f9d1734..081b84db79c 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 05bf6109ad3..ebd7a793bf4 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 930e42d44ef..105e73788fe 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 2f7827521a0..ef6b10f66c1 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import warnings from typing import Tuple, Union diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 95e776c8720..6cfa8db0d96 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 70864320fd8..72088493074 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from typing import Any, Dict import numba From 7f7e237dbbdc107a46b00924574b1d349807a47a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 00:25:48 +0000 Subject: [PATCH 130/162] Undo a few incorrect copyright fixes --- python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake | 2 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/table.pxd | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake index 6b543433a5d..d432f9fe1f5 100644 --- a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake +++ b/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023-2023, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index 21785a9b108..3567df9ac9c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2023, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. from libcpp cimport bool as cbool diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 4c47de5c0c6..0edc934ca22 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2023, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index de4ffd73be3..6fe06f00491 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2023, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pyarrow cimport lib as pa From c6353358541e0401402a32a7384ed8c57c7aae7a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:38:53 -1000 Subject: [PATCH 131/162] Remove pandas 1.3, 1.4 checks (#14927) Removes pandas 1.3, 1.4 checks in unit tests Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14927 --- python/cudf/cudf/core/_compat.py | 3 --- python/cudf/cudf/tests/test_concat.py | 6 +----- python/cudf/cudf/tests/test_csv.py | 6 +----- python/cudf/cudf/tests/test_dataframe.py | 23 ++--------------------- python/cudf/cudf/tests/test_datetime.py | 10 +--------- python/cudf/cudf/tests/test_groupby.py | 11 +---------- python/cudf/cudf/tests/test_index.py | 8 ++------ python/cudf/cudf/tests/test_replace.py | 9 ++------- python/cudf/cudf/tests/test_series.py | 11 +---------- 9 files changed, 11 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index fb267bdf7df..b602dfdf23c 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -4,9 +4,6 @@ from packaging import version PANDAS_VERSION = version.parse(pd.__version__) -PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3") -PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") -PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") PANDAS_GE_150 = PANDAS_VERSION 
>= version.parse("1.5.0") PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index ed8f32ed12b..9078d54c193 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -10,7 +10,7 @@ import cudf as gd from cudf.api.types import _is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( assert_eq, @@ -932,10 +932,6 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43584", -) def test_concat_join_no_overlapping_columns( pdf1, pdf2, ignore_index, sort, join, axis ): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 6de66bf1952..8171f3a1872 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -17,7 +17,7 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_200 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -1368,10 +1368,6 @@ def test_csv_reader_column_names(names): assert list(df) == list(names) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/rapidsai/cudf/issues/10618", -) def test_csv_reader_repeated_column_name(): buffer = """A,A,A.1,A,A.2,A,A.4,A,A 1,2,3.1,4,a.2,a,a.4,a,a diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 34dc7ebc68e..fa130a99c72 
100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -26,11 +26,9 @@ import cudf from cudf.api.extensions import no_default from cudf.core._compat import ( - PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210, - PANDAS_LT_140, PANDAS_LT_203, ) from cudf.core.buffer.spill_manager import get_global_manager @@ -3589,15 +3587,7 @@ def test_dataframe_empty_sort_index(): [2, 0, 1], ] ), - pytest.param( - pd.RangeIndex(2, -1, -1), - marks=[ - pytest_xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43591", - ) - ], - ), + pd.RangeIndex(2, -1, -1), ], ) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @@ -9584,16 +9574,7 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): pdf = pd.DataFrame(data, index=p_index, columns=labels) gdf = cudf.from_pandas(pdf) - if PANDAS_GE_134: - expect = pdf.explode(label_to_explode, ignore_index) - else: - # https://github.com/pandas-dev/pandas/issues/43314 - if isinstance(label_to_explode, int): - pdlabel_to_explode = [label_to_explode] - else: - pdlabel_to_explode = label_to_explode - expect = pdf.explode(pdlabel_to_explode, ignore_index) - + expect = pdf.explode(label_to_explode, ignore_index) got = gdf.explode(label_to_explode, ignore_index) assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d83b46250d0..60b0d787278 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -18,7 +18,6 @@ PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210, - PANDAS_LT_140, ) from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( @@ -1500,14 +1499,7 @@ def test_is_month_start(data, dtype): date_range_test_periods = [1, 10, 100] date_range_test_freq = [ {"months": 3, "years": 1}, - pytest.param( - {"hours": 10, "days": 57, "nanoseconds": 3}, - marks=pytest.mark.xfail( - 
condition=PANDAS_LT_140, - reason="Pandas ignoring nanoseconds component. " - "https://github.com/pandas-dev/pandas/issues/44393", - ), - ), + {"hours": 10, "days": 57, "nanoseconds": 3}, "83D", "17h", "-680T", diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 526aa9f503a..dcfc9d801a4 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -20,12 +20,7 @@ import cudf from cudf import DataFrame, Series from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_GE_150, - PANDAS_GE_200, - PANDAS_GE_210, - PANDAS_LT_140, -) +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.core.udf.utils import UDFError, precompiled @@ -909,10 +904,6 @@ def test_groupby_2keys_agg(nelem, func): # https://github.com/pandas-dev/pandas/issues/40685 is resolved. # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) -@pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43209", -) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # The number of digits after the decimal to use. 
decimal_digits = 2 diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4dfbcf138c3..5cc1c93deff 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -15,7 +15,7 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype -from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, @@ -514,15 +514,11 @@ def test_empty_df_head_tail_index(n): None, ), (pd.Index(range(5)), pd.Index(range(4)) > 0, None, ValueError), - pytest.param( + ( pd.Index(range(5)), pd.Index(range(5)) > 1, 10, None, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_133, - reason="https://github.com/pandas-dev/pandas/issues/43240", - ), ), ( pd.Index(np.arange(10)), diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index ac2b2c6cd30..3050ce75d12 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,12 +8,7 @@ import pytest import cudf -from cudf.core._compat import ( - PANDAS_GE_134, - PANDAS_GE_150, - PANDAS_GE_200, - PANDAS_GE_210, -) +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -1016,7 +1011,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): pd.Series(["one", "two", "three"], dtype="category"), {"to_replace": "one", "value": "two", "inplace": True}, marks=pytest.mark.xfail( - condition=(not PANDAS_GE_134) or (PANDAS_GE_200), + condition=PANDAS_GE_200, reason="https://github.com/pandas-dev/pandas/issues/43232" "https://github.com/pandas-dev/pandas/issues/53358", ), diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b7be3878412..14006f90b45 100644 --- 
a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -15,7 +15,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_LT_140 from cudf.errors import MixedTypeError from cudf.testing._utils import ( NUMERIC_TYPES, @@ -1318,15 +1317,7 @@ def test_series_raises_float16(data): pd.RangeIndex(0, 3, 1), [3.0, 1.0, np.nan], ["a", "z", None], - pytest.param( - pd.RangeIndex(4, -1, -2), - marks=[ - pytest.mark.xfail( - condition=PANDAS_LT_140, - reason="https://github.com/pandas-dev/pandas/issues/43591", - ) - ], - ), + pd.RangeIndex(4, -1, -2), ], ) @pytest.mark.parametrize("axis", [0, "index"]) From adcd7e9c757fdda3e25d7c469d062a7402a4ad95 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 30 Jan 2024 00:41:21 -0600 Subject: [PATCH 132/162] Apply suggestions from code review Co-authored-by: Bradley Dice Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/api/types.py | 3 +-- python/cudf/cudf/core/column/datetime.py | 6 ++---- python/cudf/cudf/core/column/string.py | 6 +++--- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 6 +++--- python/cudf/cudf/core/series.py | 4 ++-- python/cudf/cudf/core/single_column_frame.py | 4 ++-- python/cudf/cudf/tests/test_binops.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 2 +- 12 files changed, 21 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index f6f5e522cbd..6a9e5933e12 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -540,8 +540,7 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_named_tuple = pd_types.is_named_tuple is_iterator = pd_types.is_iterator is_bool = pd_types.is_bool -is_categorical = pd_types.is_categorical_dtype -# TODO +is_categorical_dtype = 
pd_types.is_categorical_dtype is_complex = pd_types.is_complex is_float = pd_types.is_float is_hashable = pd_types.is_hashable diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a0c0b119ef7..6682bbb333b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -113,14 +113,12 @@ def infer_format(element: str, **kwargs) -> str: raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) - if ".%f" in fmt: + if ".%f" not in fmt: # For context read: # https://github.com/pandas-dev/pandas/issues/52418 # We cannot rely on format containing only %f # c++/libcudf expects .%3f, .%6f, .%9f # Logic below handles those cases well. - pass - else: return fmt element_parts = element.split(".") @@ -534,7 +532,7 @@ def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: def cov(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" + f"cannot perform cov with types {self.dtype}, {other.dtype}" ) return self.as_numerical_column("int64").cov( other.as_numerical_column("int64") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2fdcf30606a..2373f94ee97 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -697,7 +697,7 @@ def contains( >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN] >>> idx = cudf.Index(data) >>> idx - Index(['Mouse' 'dog' 'house and parrot' '23.0' None], dtype='object') + Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object') >>> idx.str.contains('23', regex=False) Index([False, False, False, True, ], dtype='bool') @@ -2805,7 +2805,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - Index(['X 123' 'Y 999'], dtype='object') + 
Index(['X 123', 'Y 999'], dtype='object') Which will create a MultiIndex: @@ -2878,7 +2878,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx - Index(['X 123' 'Y 999'], dtype='object') + Index(['X 123', 'Y 999'], dtype='object') Which will create a MultiIndex: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 094ccb57a1c..edf05fbb264 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -406,7 +406,7 @@ def std( def cov(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" + f"cannot perform cov with types {self.dtype}, {other.dtype}" ) return self.as_numerical_column("int64").cov( other.as_numerical_column("int64") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d0cbacfb7e8..23f153e14fa 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7267,14 +7267,14 @@ def pct_change( fill_method : str, default 'ffill' How to handle NAs before computing percent changes. - .. deprecated:: 23.12 + .. deprecated:: 24.04 All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. - .. deprecated:: 23.12 + .. deprecated:: 24.04 `limit` is deprecated. freq : str, optional Increment to use from time series API. diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 734dd501d48..11e64faecf9 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1026,7 +1026,7 @@ def _is_categorical_dtype(obj): def is_categorical_dtype(obj): """Check whether an array-like or dtype is of the Categorical dtype. - .. deprecated:: 23.12 + .. 
deprecated:: 24.04 Use isinstance(dtype, cudf.CategoricalDtype) instead Parameters diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d061045fb2d..a1c5cf40024 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -748,7 +748,7 @@ def fillna( non-null value. `bfill` propagates backward with the next non-null value. Cannot be used with ``value``. - .. deprecated:: 23.12 + .. deprecated:: 24.04 `method` is deprecated. Returns diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b3577444f6b..ba802c47479 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -290,7 +290,7 @@ def dtypes(self): """ Return the dtypes in this group. - .. deprecated:: 23.08 + .. deprecated:: 24.04 Use `.dtypes` on base object instead. Returns @@ -2343,14 +2343,14 @@ def pct_change( fill_method : str, default 'ffill' How to handle NAs before computing percent changes. - .. deprecated:: 23.12 + .. deprecated:: 24.04 All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. - .. deprecated:: 23.12 + .. deprecated:: 24.04 `limit` is deprecated. freq : str, optional Increment to use from time series API. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7ff529dbd05..649b0688992 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3582,14 +3582,14 @@ def pct_change( fill_method : str, default 'ffill' How to handle NAs before computing percent changes. - .. deprecated:: 23.12 + .. deprecated:: 24.04 All options of `fill_method` are deprecated except `fill_method=None`. limit : int, optional The number of consecutive NAs to fill before stopping. Not yet implemented. - .. deprecated:: 23.12 + .. deprecated:: 24.04 `limit` is deprecated. 
freq : str, optional Increment to use from time series API. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 65fbc968c12..97779522b8b 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -159,7 +159,7 @@ def from_arrow(cls, array): >>> import cudf >>> import pyarrow as pa >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a' 'b' None], dtype='object') + Index(['a', 'b', None], dtype='object') >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) 0 a 1 b @@ -273,7 +273,7 @@ def factorize(self, sort=False, use_na_sentinel=True): >>> codes array([0, 0, 1], dtype=int8) >>> uniques - Index(['a' 'c'], dtype='object') + Index(['a', 'c'], dtype='object') """ return cudf.core.algorithms.factorize( self, diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 3d920f225d3..2c5d46f2ca2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -663,11 +663,11 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) - # Test with a Index + # Test with an Index pdf2 = pd.DataFrame( {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] ) - # Test with a Index in a different order + # Test with an Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, index=[0, 3, 5, 3], diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fa130a99c72..a483657a334 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10900,7 +10900,7 @@ def test_dataframe_contains(name, contains, other_names): assert (contains in pdf) == expectation assert (contains in gdf) == expectation elif pd.api.types.is_float_dtype(gdf.columns.dtype): - # In 
some cases, the columns are converted to a Index[float] based on + # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. expectation = contains is np.nan and (name is None or name is np.nan) assert (contains in pdf) == expectation From 86a4068cc61494b56c66b9384a66206d46b06e06 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:42:38 -1000 Subject: [PATCH 133/162] Allow hash_array to be findable in pandas 2.0; add workaround for test_resample for cudf.pandas (#14908) Fixes new failure in test_hash_array. Open to feedback on a better approach. The main issue is that some public methods are defined under __getattr__ with no __dir__ to find them (which we rely on for module population) --- python/cudf/cudf/pandas/_wrappers/pandas.py | 17 +++++++++++++++++ .../cudf/cudf_pandas_tests/test_cudf_pandas.py | 3 +++ 2 files changed, 20 insertions(+) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 137709925df..b7c8e92e8db 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -2,6 +2,7 @@ # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import copyreg +import importlib import pickle import sys @@ -47,6 +48,22 @@ cudf.set_option("mode.pandas_compatible", True) +def _pandas_util_dir(): + # In pandas 2.0, pandas.util contains public APIs under + # __getattr__ but no __dir__ to find them + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py + return list(importlib.import_module("pandas.util").__dict__.keys()) + [ + "hash_array", + "hash_pandas_object", + "Appender", + "Substitution", + "cache_readonly", + ] + + +pd.util.__dir__ = _pandas_util_dir + + def make_final_proxy_type( name, fast_type, diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index e36e1a68114..546f8df95f3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -909,6 +909,9 @@ def test_resample(): ) expected = ser.resample("D").max() result = xser.resample("D").max() + # TODO: See if as_unit can be avoided + expected.index = expected.index.as_unit("s") + result.index = result.index.as_unit("s") tm.assert_series_equal(result, expected) From 92b6472cd3d8ce2bdbee3e8d9dbae187ec227c31 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:44:46 -1000 Subject: [PATCH 134/162] Remove pandas 1.5 checks (#14928) Remove pandas 1.5 checks mostly in unit tests. 
--- python/cudf/cudf/api/types.py | 4 +- python/cudf/cudf/core/_compat.py | 2 - python/cudf/cudf/core/column/column.py | 8 +-- python/cudf/cudf/core/dtypes.py | 7 +-- python/cudf/cudf/core/multiindex.py | 17 +----- python/cudf/cudf/core/window/rolling.py | 23 +++----- python/cudf/cudf/tests/test_array_ufunc.py | 11 +--- python/cudf/cudf/tests/test_binops.py | 45 ++++----------- python/cudf/cudf/tests/test_concat.py | 62 ++++----------------- python/cudf/cudf/tests/test_dataframe.py | 53 ++---------------- python/cudf/cudf/tests/test_datetime.py | 16 +----- python/cudf/cudf/tests/test_df_protocol.py | 5 -- python/cudf/cudf/tests/test_dtypes.py | 9 +-- python/cudf/cudf/tests/test_groupby.py | 25 +-------- python/cudf/cudf/tests/test_numerical.py | 15 +---- python/cudf/cudf/tests/test_parquet.py | 64 ++++++++++------------ python/cudf/cudf/tests/test_replace.py | 20 +++---- python/cudf/cudf/tests/test_rolling.py | 22 +------- python/cudf/cudf/tests/test_setitem.py | 31 +++-------- python/cudf/cudf/tests/test_string.py | 9 +-- 20 files changed, 98 insertions(+), 350 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 6a9e5933e12..6a9eb68d6f5 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -16,7 +16,6 @@ from pandas.api import types as pd_types import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, _is_categorical_dtype, @@ -497,8 +496,9 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: pd.Float64Dtype, pd.BooleanDtype, pd.StringDtype, + pd.ArrowDtype, ), - ) or (PANDAS_GE_150 and isinstance(dtype_to_check, pd.ArrowDtype)): + ): return True elif isinstance(dtype_to_check, pd.CategoricalDtype): return _is_pandas_nullable_extension_dtype( diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index b602dfdf23c..f15e85b7a88 100644 --- a/python/cudf/cudf/core/_compat.py +++ 
b/python/cudf/cudf/core/_compat.py @@ -4,8 +4,6 @@ from packaging import version PANDAS_VERSION = version.parse(pd.__version__) -PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0") -PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_EQ_200 = PANDAS_VERSION == version.parse("2.0.0") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") PANDAS_GE_201 = PANDAS_VERSION >= version.parse("2.0.1") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 569e8e30dd2..ad56cabb48e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -28,6 +28,7 @@ import pyarrow as pa import pyarrow.compute as pc from numba import cuda +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType from typing_extensions import Self import rmm @@ -66,7 +67,7 @@ is_scalar, is_string_dtype, ) -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_210 from cudf.core.abc import Serializable from cudf.core.buffer import ( Buffer, @@ -97,11 +98,6 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType - if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray else: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 11e64faecf9..f05758d6993 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -13,6 +13,7 @@ import pyarrow as pa from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType from pandas.core.dtypes.dtypes import ( CategoricalDtype as pd_CategoricalDtype, CategoricalDtypeType as pd_CategoricalDtypeType, @@ -20,16 +21,10 @@ import cudf from cudf._typing import Dtype -from 
cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType - def dtype(arbitrary): """ diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 081109e81bc..a747ca8eea0 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -24,7 +24,6 @@ from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column -from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, @@ -469,21 +468,7 @@ def __repr__(self): ) ) - if not PANDAS_GE_150: - # Need this whole `if` block, - # this is a workaround for the following issue: - # https://github.com/pandas-dev/pandas/issues/39984 - preprocess_pdf = pd.DataFrame( - { - name: col.to_pandas(nullable=(col.dtype.kind != "f")) - for name, col in preprocess._data.items() - } - ) - - preprocess_pdf.columns = preprocess.names - preprocess = pd.MultiIndex.from_frame(preprocess_pdf) - else: - preprocess = preprocess.to_pandas(nullable=True) + preprocess = preprocess.to_pandas(nullable=True) preprocess.values[:] = tuples_list else: preprocess = preprocess.to_pandas(nullable=True) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index f4322aefceb..890e4ecc2f0 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -10,7 +10,6 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number from cudf.core import column -from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from 
cudf.core.mixins import Reducible @@ -217,21 +216,13 @@ def _apply_agg_column(self, source_column, agg_name): following_window = None window = self.window elif isinstance(self.window, BaseIndexer): - if PANDAS_GE_150: - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - step=None, - ) - else: - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - ) + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + step=None, + ) start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 3e3f3aa5dfa..d6b944ebeac 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -12,7 +12,7 @@ from packaging import version import cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.testing._utils import ( assert_eq, expect_warning_if, @@ -76,15 +76,6 @@ def _hide_ufunc_warnings(ufunc): def test_ufunc_index(request, ufunc): # Note: This test assumes that all ufuncs are unary or binary. 
fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=( - fname in {"bitwise_and", "bitwise_or", "bitwise_xor"} - and not PANDAS_GE_150 - ), - reason="https://github.com/pandas-dev/pandas/issues/46769", - ) - ) request.applymarker( pytest.mark.xfail( condition=not hasattr(cp, fname), diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 2c5d46f2ca2..3ebefa6e071 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,6 @@ import cudf from cudf import Series -from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.index import as_index from cudf.testing import _utils as utils @@ -1706,13 +1705,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): "minutes", "seconds", "microseconds", - pytest.param( - "nanoseconds", - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/36589", - ), - ), + "nanoseconds", ], ) @pytest.mark.parametrize( @@ -1758,29 +1751,17 @@ def test_datetime_dateoffset_binaryop( {"months": 2, "years": 5}, {"microseconds": 1, "seconds": 1}, {"months": 2, "years": 5, "seconds": 923, "microseconds": 481}, - pytest.param( - {"milliseconds": 4}, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas gets the wrong answer for milliseconds", - ), - ), - pytest.param( - {"milliseconds": 4, "years": 2}, - marks=pytest_xfail( - reason="https://github.com/pandas-dev/pandas/issues/49897" - ), - ), - pytest.param( - {"nanoseconds": 12}, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas gets the wrong answer for nanoseconds", - ), - ), + {"milliseconds": 4}, + {"milliseconds": 4, "years": 2}, {"nanoseconds": 12}, ], ) +@pytest.mark.filterwarnings( + "ignore:Non-vectorized DateOffset:pandas.errors.PerformanceWarning" +) +@pytest.mark.filterwarnings( + "ignore:Discarding nonzero 
nanoseconds:UserWarning" +) @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") @@ -1816,13 +1797,7 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): "minutes", "seconds", "microseconds", - pytest.param( - "nanoseconds", - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/36589", - ), - ), + "nanoseconds", ], ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 9078d54c193..4b0e46bf286 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -10,7 +10,7 @@ import cudf as gd from cudf.api.types import _is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( assert_eq, @@ -828,13 +828,7 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): axis=axis, ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -902,13 +896,7 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + 
assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize( @@ -953,13 +941,7 @@ def test_concat_join_no_overlapping_columns( axis=axis, ) - if PANDAS_GE_150: - assert_eq(expected, actual, check_index_type=True) - else: - # special handling of check_index_type below - # required because: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("ignore_index", [False, True]) @@ -1113,7 +1095,7 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( assert_eq( expected, actual, - check_index_type=PANDAS_GE_150, + check_index_type=True, check_column_type=not PANDAS_GE_200, ) @@ -1149,21 +1131,11 @@ def test_concat_join_series(ignore_index, sort, join, axis): axis=axis, ) - if PANDAS_GE_150: - assert_eq( - expected, - actual, - check_index_type=True, - ) - else: - # special handling of check_index_type required below: - # https://github.com/pandas-dev/pandas/issues/46675 - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq( - expected, - actual, - check_index_type=(axis == 0), - ) + assert_eq( + expected, + actual, + check_index_type=True, + ) @pytest.mark.parametrize( @@ -1323,19 +1295,7 @@ def test_concat_join_empty_dataframes( ) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize( - "join", - [ - "inner", - pytest.param( - "outer", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/37937", - ), - ), - ], -) +@pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [1]) def test_concat_join_empty_dataframes_axis_1( df, other, ignore_index, axis, join, sort diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a483657a334..a0f6c4c3cfc 100644 --- 
a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -25,12 +25,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_GE_150, - PANDAS_GE_200, - PANDAS_GE_210, - PANDAS_LT_203, -) +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_LT_203 from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -345,27 +340,9 @@ def test_concat_index(a, b): {"a": [1, None, None], "b": [3, np.nan, np.nan]}, {1: ["a", "b", "c"], 2: ["q", "w", "u"]}, {1: ["a", np.nan, "c"], 2: ["q", None, "u"]}, - pytest.param( - {}, - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/rapidsai/cudf/issues/11080", - ), - ), - pytest.param( - {1: [], 2: [], 3: []}, - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/rapidsai/cudf/issues/11080", - ), - ), - pytest.param( - [1, 2, 3], - marks=pytest_xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/rapidsai/cudf/issues/11080", - ), - ), + {}, + {1: [], 2: [], 3: []}, + [1, 2, 3], ], ) def test_axes(data): @@ -1882,18 +1859,7 @@ def test_nonmatching_index_setitem(nrows): assert_eq(gdf["c"].to_pandas(), gdf_series.to_pandas()) -@pytest.mark.parametrize( - "dtype", - [ - "int", - pytest.param( - "int64[pyarrow]", - marks=pytest.mark.skipif( - not PANDAS_GE_150, reason="pyarrow support only in >=1.5" - ), - ), - ], -) +@pytest.mark.parametrize("dtype", ["int", "int64[pyarrow]"]) def test_from_pandas(dtype): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) df.columns.name = "custom_column_name" @@ -7710,14 +7676,7 @@ def test_dataframe_concat_dataframe(df, other, sort, ignore_index): "other", [ pd.Series([10, 11, 23, 234, 13]), - pytest.param( - pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - 
reason="pandas bug: " - "https://github.com/pandas-dev/pandas/issues/35092", - ), - ), + pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), {1: 1}, {0: 10, 1: 100, 2: 102}, ], diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 60b0d787278..62733625485 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -13,12 +13,7 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import ( - PANDAS_EQ_200, - PANDAS_GE_150, - PANDAS_GE_200, - PANDAS_GE_210, -) +from cudf.core._compat import PANDAS_EQ_200, PANDAS_GE_200, PANDAS_GE_210 from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1504,14 +1499,7 @@ def test_is_month_start(data, dtype): "17h", "-680T", "110546s", - pytest.param( - "110546789L", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="Pandas DateOffset ignores milliseconds. 
" - "https://github.com/pandas-dev/pandas/issues/43371", - ), - ), + "110546789L", "110546789248U", ] diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index bffbade14d8..a22b678ebe6 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer import as_buffer from cudf.core.column import as_column, build_column from cudf.core.df_protocol import ( @@ -278,9 +277,5 @@ def test_NA_mixed_dtype(): assert_df_unique_dtype_cols(data_mixed) -@pytest.mark.skipif( - not PANDAS_GE_150, - reason="Pandas versions < 1.5.0 do not support interchange protocol", -) def test_from_cpu_df(pandas_df): cudf.from_dataframe(pandas_df, allow_copy=True) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 6e24099f1a8..0efd8d9781c 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,12 +1,12 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import numpy as np import pandas as pd import pyarrow as pa import pytest +from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, @@ -20,11 +20,6 @@ from cudf.testing._utils import assert_eq from cudf.utils.dtypes import np_to_pa_dtype -if PANDAS_GE_150: - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -else: - from pandas.core.arrays._arrow_utils import ArrowIntervalType - def test_cdt_basic(): psr = pd.Series(["a", "b", "a", "c"], dtype="category") diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index dcfc9d801a4..a0b86d735cc 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -20,7 +20,7 @@ import cudf from cudf import DataFrame, Series from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.core.udf.utils import UDFError, precompiled @@ -1190,13 +1190,7 @@ def test_advanced_groupby_levels(): @pytest.mark.parametrize( "func", [ - pytest.param( - lambda df: df.groupby(["x", "y", "z"]).sum(), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/32464", - ), - ), + lambda df: df.groupby(["x", "y", "z"]).sum(), lambda df: df.groupby(["x", "y"]).sum(), lambda df: df.groupby(["x", "y"]).agg("sum"), lambda df: df.groupby(["y"]).sum(), @@ -3294,20 +3288,7 @@ def test_groupby_pct_change_empty_columns(): assert_eq(expected, actual) -@pytest.mark.parametrize( - "group_keys", - [ - None, - pytest.param( - True, - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - 
reason="https://github.com/pandas-dev/pandas/pull/34998", - ), - ), - False, - ], -) +@pytest.mark.parametrize("group_keys", [None, True, False]) @pytest.mark.parametrize("by", ["A", ["A", "B"]]) def test_groupby_group_keys(group_keys, by): gdf = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index fee5cc0ad21..2139e7b9860 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,7 +5,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import NUMERIC_TYPES, assert_eq from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -267,12 +266,7 @@ def test_to_numeric_downcast_large_float_pd_bug(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - if PANDAS_GE_150: - assert_eq(expected, got) - else: - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): - assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( @@ -350,12 +344,7 @@ def test_to_numeric_downcast_string_large_float(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - if PANDAS_GE_150: - assert_eq(expected, got) - else: - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): - assert_eq(expected, got) + assert_eq(expected, got) else: expected = pd.Series([np.inf, -np.inf]) with pytest.warns( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 105c31cc71f..b4e24bd1617 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -21,7 +21,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_153 +from 
cudf.core._compat import PANDAS_GE_200 from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -2612,43 +2612,37 @@ def test_parquet_writer_list_statistics(tmpdir): ] }, # Struct of Lists - pytest.param( - { - "Real estate records": [ - None, - { - "Status": "NRI", - "Ownerships": { - "land_unit": [None, 2, None], - "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], - }, - }, - { - "Status": None, - "Ownerships": { - "land_unit": [4, 5], - "flats": [[7, 8], []], - }, + { + "Real estate records": [ + None, + { + "Status": "NRI", + "Ownerships": { + "land_unit": [None, 2, None], + "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], }, - { - "Status": "RI", - "Ownerships": {"land_unit": None, "flats": [[]]}, + }, + { + "Status": None, + "Ownerships": { + "land_unit": [4, 5], + "flats": [[7, 8], []], }, - {"Status": "RI", "Ownerships": None}, - { - "Status": None, - "Ownerships": { - "land_unit": [7, 8, 9], - "flats": [[], [], []], - }, + }, + { + "Status": "RI", + "Ownerships": {"land_unit": None, "flats": [[]]}, + }, + {"Status": "RI", "Ownerships": None}, + { + "Status": None, + "Ownerships": { + "land_unit": [7, 8, 9], + "flats": [[], [], []], }, - ] - }, - marks=pytest.mark.xfail( - condition=PANDAS_LT_153, - reason="pandas assertion fixed in pandas 1.5.3", - ), - ), + }, + ] + }, ], ) def test_parquet_writer_nested(tmpdir, data): diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 3050ce75d12..6db1c97b9fd 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -167,18 +167,12 @@ def test_series_replace_with_nulls(): "c": ["abc", "def", ".", None, None], } ), - 
pytest.param( - cudf.DataFrame( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - dtype="category", - ), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/46672", - ), + cudf.DataFrame( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + dtype="category", ), cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 22dcf5dfa7e..9c3c9d1082c 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200 from cudf.testing._utils import assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -480,7 +480,7 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def custom_get_window_bounds( + def get_window_bounds( self, num_values, min_periods, center, closed, step=None ): start = np.empty(num_values, dtype=np.int64) @@ -496,24 +496,6 @@ def custom_get_window_bounds( return start, end - if PANDAS_GE_150: - - def get_window_bounds( - self, num_values, min_periods, center, closed, step - ): - return self.custom_get_window_bounds( - num_values, min_periods, center, closed, step - ) - - else: - - def get_window_bounds( - self, num_values, min_periods, center, closed - ): - return self.custom_get_window_bounds( - num_values, min_periods, center, closed - ) - use_expanding = [True, False, True, False, True] indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index e8d7fdadbff..de0826d61e9 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import 
cudf -from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_210 from cudf.testing._utils import ( assert_eq, assert_exceptions_equal, @@ -230,22 +230,12 @@ def test_categorical_setitem_invalid(): ps = pd.Series([1, 2, 3], dtype="category") gs = cudf.Series([1, 2, 3], dtype="category") - if PANDAS_GE_150: - assert_exceptions_equal( - lfunc=ps.__setitem__, - rfunc=gs.__setitem__, - lfunc_args_and_kwargs=([0, 5], {}), - rfunc_args_and_kwargs=([0, 5], {}), - ) - else: - # Following workaround is needed because: - # https://github.com/pandas-dev/pandas/issues/46646 - with pytest.raises( - ValueError, - match="Cannot setitem on a Categorical with a new category, set " - "the categories first", - ): - gs[0] = 5 + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + ) def test_series_slice_setitem_list(): @@ -318,11 +308,8 @@ def test_series_setitem_upcasting(dtype, indices): sr[indices] = new_value with expect_warning_if(dtype != np.float64): cr[indices] = new_value - if PANDAS_GE_150: - assert_eq(sr, cr) - else: - # pandas bug, incorrectly fails to upcast from float32 to float64 - assert_eq(sr.values, cr.values) + assert_eq(sr, cr) + if dtype == np.float64: # no-op type cast should not modify backing column assert col_ref == cr._column diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 8c8a3cb2399..b2bf687ba06 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -15,7 +15,6 @@ import cudf from cudf import concat -from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.string import StringColumn from cudf.core.index import Index, as_index from cudf.testing._utils import ( @@ -1721,13 +1720,7 @@ def test_strings_filling_tests(data, width, fillchar): ["A,,B", "1,,5", "3,00,0"], ["Linda van der Berg", "George Pitt-Rivers"], ["³", "⅕", 
""], - pytest.param( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - marks=pytest.mark.xfail( - condition=not PANDAS_GE_150, - reason="https://github.com/pandas-dev/pandas/issues/20868", - ), - ), + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], [" ", "\t\r\n ", ""], ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], ], From 132978fe85e7f700078a1eb1f0a4264ff404274f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Jan 2024 08:19:38 +0000 Subject: [PATCH 135/162] Address all remaining reviews --- .pre-commit-config.yaml | 2 ++ python/cudf/cudf/api/types.py | 4 +++- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/column_accessor.py | 28 ++++------------------ python/cudf/cudf/core/dataframe.py | 12 +++++++--- python/cudf/cudf/core/dtypes.py | 2 ++ python/cudf/cudf/core/groupby/groupby.py | 7 ++++++ python/cudf/cudf/core/index.py | 5 +++- python/cudf/cudf/core/indexed_frame.py | 6 +++++ python/cudf/cudf/core/reshape.py | 4 ++++ python/cudf/cudf/core/series.py | 13 ++++++++++ python/cudf/cudf/tests/test_array_ufunc.py | 7 ++---- python/cudf/cudf/tests/test_timedelta.py | 5 +--- python/cudf/cudf/utils/ioutils.py | 10 ++++++++ 14 files changed, 68 insertions(+), 38 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad8e2f6c5ee..ccda2596031 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,6 +91,8 @@ repos: entry: '(category=|\s)DeprecationWarning[,)]' language: pygrep types_or: [python, cython] + # We need to exclude just the following file because few APIs still need + # DeprecationWarning: https://github.com/pandas-dev/pandas/issues/54970 exclude: | (?x)^( ^python/cudf/cudf/core/dtypes.py diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 6a9eb68d6f5..a422eb82231 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -16,6 +16,7 @@ from pandas.api import types as pd_types import cudf +from 
cudf.core._compat import PANDAS_LT_300 from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, _is_categorical_dtype, @@ -467,11 +468,13 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: def _is_datetime64tz_dtype(obj): with warnings.catch_warnings(): warnings.simplefilter("ignore") + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." return _wrap_pandas_is_dtype_api(pd_types.is_datetime64tz_dtype)(obj) def is_datetime64tz_dtype(obj): # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "is_datetime64tz_dtype is deprecated and will be removed in a future " "version.", @@ -540,7 +543,6 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_named_tuple = pd_types.is_named_tuple is_iterator = pd_types.is_iterator is_bool = pd_types.is_bool -is_categorical_dtype = pd_types.is_categorical_dtype is_complex = pd_types.is_complex is_float = pd_types.is_float is_hashable = pd_types.is_hashable diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index f15e85b7a88..5aa685560c8 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -11,3 +11,4 @@ PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4") PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3") +PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0") diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 93abaae6120..33085bede78 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,6 @@ from __future__ import annotations import itertools -import warnings from collections import abc from functools import cached_property, reduce from typing import ( @@ -23,7 +22,6 @@ import cudf from cudf.core import column -from cudf.core._compat import PANDAS_GE_200 if 
TYPE_CHECKING: from cudf._typing import Dtype @@ -237,28 +235,10 @@ def _clear_cache(self): def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: - if PANDAS_GE_200: - result = pd.MultiIndex.from_tuples( - self.names, - names=self.level_names, - ) - else: - # Using `from_frame()` instead of `from_tuples` - # prevents coercion of values to a different type - # (e.g., ''->NaT) - with warnings.catch_warnings(): - # Specifying `dtype="object"` here and passing that to - # `from_frame` is deprecated in pandas, but we cannot - # remove that without also losing compatibility with other - # current pandas behaviors like the NaT inference above. - warnings.simplefilter("ignore") - result = pd.MultiIndex.from_frame( - pd.DataFrame( - self.names, - columns=self.level_names, - dtype="object", - ), - ) + result = pd.MultiIndex.from_tuples( + self.names, + names=self.level_names, + ) else: # Determine if we can return a RangeIndex if self.rangeindex: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 23f153e14fa..1b0f83c5d70 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,7 +58,7 @@ is_string_dtype, ) from cudf.core import column, df_protocol, indexing_utils, reshape -from cudf.core._compat import PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -4589,6 +4589,7 @@ def applymap( Transformed DataFrame. """ # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "DataFrame.applymap has been deprecated. 
Use DataFrame.map " "instead.", @@ -6102,8 +6103,6 @@ def _reduce( source = self if axis is None: - # if op in {"any", "all"}: - # axis = 2 if op in {"sum", "product", "std", "var"}: # Do not remove until pandas 2.0 support is added. warnings.warn( @@ -6140,6 +6139,7 @@ def _reduce( if axis == 2 and op in ("kurtosis", "kurt", "skew"): # TODO: concat + op can probably be done in the general case # for axis == 2. + # https://github.com/rapidsai/cudf/issues/14930 return getattr(concat_columns(source._data.columns), op)( **kwargs ) @@ -6323,6 +6323,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return DataFrame() with warnings.catch_warnings(): + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.simplefilter("ignore", FutureWarning) df = cudf.concat(mode_results, axis=1) @@ -7303,6 +7306,9 @@ def pct_change( if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index f05758d6993..17d6d42618a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -21,6 +21,7 @@ import cudf from cudf._typing import Dtype +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply @@ -1035,6 +1036,7 @@ def is_categorical_dtype(obj): Whether or not the array-like or dtype is of a categorical dtype. """ # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "is_categorical_dtype is deprecated and will be removed in a future " "version. 
Use isinstance(dtype, cudf.CategoricalDtype) instead", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ba802c47479..1f08abdc7fc 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -23,6 +23,7 @@ from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor @@ -2244,6 +2245,9 @@ def fillna( if method not in {"ffill", "bfill"}: raise ValueError("Method can only be of 'ffill', 'bfill'.") # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"{type(self).__name__}.fillna with 'method' is " "deprecated and will raise in a future version. " @@ -2374,6 +2378,9 @@ def pct_change( if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." 
warnings.warn( "The 'fill_method' keyword being not None and the 'limit' " f"keywords in {type(self).__name__}.pct_change are " diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2bd4219997f..fa9e49baaa2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -40,7 +40,7 @@ is_signed_integer_dtype, ) from cudf.core._base_index import BaseIndex -from cudf.core._compat import PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200, PANDAS_LT_300 from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -1107,6 +1107,9 @@ def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d7239dbcf2f..8c3276d7703 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -48,6 +48,7 @@ is_scalar, ) from cudf.core._base_index import BaseIndex +from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column, full from cudf.core.column_accessor import ColumnAccessor @@ -2160,6 +2161,9 @@ def fillna( ): # noqa: D102 if method is not None: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"{type(self).__name__}.fillna with 'method' is " "deprecated and will raise in a future version. " @@ -3389,6 +3393,7 @@ def first(self, offset): 2018-04-11 2 """ # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." 
warnings.warn( "first is deprecated and will be removed in a future version. " "Please create a mask and filter using `.loc` instead", @@ -3441,6 +3446,7 @@ def last(self, offset): 2018-04-15 4 """ # Do not remove until pandas 3.0 support is added. + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "last is deprecated and will be removed in a future version. " "Please create a mask and filter using `.loc` instead", diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 05ab1edfaba..2ea538d66a1 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -14,6 +14,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import Dtype from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn from cudf.utils.dtypes import min_unsigned_type @@ -324,6 +325,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): any_empty = any(obj.empty for obj in objs) if any_empty: # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. 
In a future version, this will no longer exclude " diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 649b0688992..77ed7644f69 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -48,6 +48,7 @@ is_string_dtype, ) from cudf.core import indexing_utils +from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -278,6 +279,9 @@ def __setitem__(self, key, value): value = value.astype(to_dtype) if to_dtype != self._frame._column.dtype: # Do not remove until pandas-3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Setting an item of incompatible dtype is deprecated " "and will raise in a future error of pandas. " @@ -388,10 +392,16 @@ def _loc_to_iloc(self, arg): arg.dtype ): # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn(warn_msg, FutureWarning) return arg.value elif is_integer(arg): # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn(warn_msg, FutureWarning) return arg try: @@ -3617,6 +3627,9 @@ def pct_change( ) if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." 
warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index d6b944ebeac..3ba0403d67c 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -7,12 +7,10 @@ import cupy as cp import numpy as np -import pandas as pd import pytest -from packaging import version import cudf -from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210 +from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_210, PANDAS_LT_300 from cudf.testing._utils import ( assert_eq, expect_warning_if, @@ -84,8 +82,7 @@ def test_ufunc_index(request, ufunc): ) request.applymarker( pytest.mark.xfail( - condition=fname == "matmul" - and version.parse(pd.__version__) < version.parse("3.0"), + condition=fname == "matmul" and PANDAS_LT_300, reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", ) ) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 7cae2f3a30f..18fe1700e25 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -696,10 +696,7 @@ def test_timedelta_dt_components(data, dtype): @pytest.mark.parametrize( "data", - _TIMEDELTA_DATA_NON_OVERFLOW, - # TODO-PANDAS-2.0: Replace above with `_TIMEDELTA_DATA` - # after the following issue is fixed: - # https://github.com/pandas-dev/pandas/issues/52386 + _TIMEDELTA_DATA, ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_dt_properties(data, dtype): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1c5bde89800..feb02bac60d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -15,6 +15,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.core._compat import PANDAS_LT_300 from 
cudf.utils.docutils import docfmt_partial try: @@ -1683,6 +1684,9 @@ def get_reader_filepath_or_buffer( if fs is None: if warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. " @@ -1704,6 +1708,9 @@ def get_reader_filepath_or_buffer( ) elif warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. " @@ -1713,6 +1720,9 @@ def get_reader_filepath_or_buffer( ) elif warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. 
" From 30f873d86cbb0f7c9536acd5530ae2b7f9d7b68e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Jan 2024 09:11:13 +0000 Subject: [PATCH 136/162] Address all dask_cudf reviews --- python/dask_cudf/dask_cudf/io/tests/test_csv.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 12 ------------ python/dask_cudf/dask_cudf/tests/test_groupby.py | 3 +-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 987fcf6b4ae..a35a9f1be48 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -233,6 +233,18 @@ def test_read_csv_skipfooter(csv_end_bad_lines): dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False) +def test_read_csv_skipfooter_arrow_string_fail(request, csv_end_bad_lines): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/14915", + ) + ) + ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute() + ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute() + + dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False) + + def test_read_csv_skipfooter_error(csv_end_bad_lines): with pytest.raises(ValueError): dask_cudf.read_csv( diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 552d800e2dd..afe2a050695 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -940,15 +940,3 @@ def test_categorical_dtype_round_trip(): actual = ds.compute() expected = pds.compute() assert actual.dtype.ordered == expected.dtype.ordered - - -def test_object_to_string_fail(request): - request.applymarker( - pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/14915", - ) - ) - s = cudf.Series(["a", "b", "c"] * 10) - ds = dgd.from_cudf(s, npartitions=2) - pds = dd.from_pandas(s.to_pandas(), npartitions=2) - 
dd.assert_eq(ds.sort_values(), pds.sort_values()) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cef8bdacace..c8cc6e65fa5 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -610,8 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): if as_index: # Groupby columns became the index. # Sorting the index should not change anything. - with dask.config.set({"dataframe.convert-string": False}): - dd.assert_eq(gf.index, gf.sort_index().index) + dd.assert_eq(gf.index.to_frame(), gf.sort_index().index.to_frame()) else: # Groupby columns are did NOT become the index. # Sorting by these columns should not change anything. From 2b05b59720ea0d25566b844a6d5cfd0afef74ab3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 30 Jan 2024 04:23:33 -0600 Subject: [PATCH 137/162] Fix custreamz pytests to test on float64 types (#14934) This PR passes types to empty dataframe construction because reductions were being performed on float64 types and now empty column default type is object. 
From 2e307535554664180fc06de4805dbe0a297bbdaf Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Jan 2024 12:40:36 +0000 Subject: [PATCH 138/162] Remaining custreamz test fix --- python/custreamz/custreamz/tests/test_dataframes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 7ce398c7617..779560a394a 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -749,7 +749,7 @@ def on_old(self, state, new): def test_groupby_aggregate_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example).groupby(["name"]) output0 = sdf.amount.sum(start=None).stream.gather().sink_to_list() output1 = ( @@ -817,7 +817,7 @@ def test_reductions_with_start_state(stream): def test_rolling_aggs_with_start_state(stream): - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example) output0 = ( sdf.rolling(2, with_state=True, start=()) From 1937252684b2589781e6a13075ce9458b649d40e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Jan 2024 12:55:29 +0000 Subject: [PATCH 139/162] Remove missing docstrings --- docs/cudf/source/user_guide/api_docs/groupby.rst | 2 -- docs/cudf/source/user_guide/api_docs/series.rst | 1 - 2 files changed, 3 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst index 26dd9bb354b..80811efa33f 100644 --- a/docs/cudf/source/user_guide/api_docs/groupby.rst +++ b/docs/cudf/source/user_guide/api_docs/groupby.rst @@ -80,7 +80,6 @@ application to columns of a specific data type. .. 
autosummary:: :toctree: api/ - DataFrameGroupBy.backfill DataFrameGroupBy.bfill DataFrameGroupBy.count DataFrameGroupBy.cumcount @@ -94,7 +93,6 @@ application to columns of a specific data type. DataFrameGroupBy.idxmax DataFrameGroupBy.idxmin DataFrameGroupBy.nunique - DataFrameGroupBy.pad DataFrameGroupBy.quantile DataFrameGroupBy.shift DataFrameGroupBy.size diff --git a/docs/cudf/source/user_guide/api_docs/series.rst b/docs/cudf/source/user_guide/api_docs/series.rst index 4672db04eb3..28931d567b4 100644 --- a/docs/cudf/source/user_guide/api_docs/series.rst +++ b/docs/cudf/source/user_guide/api_docs/series.rst @@ -158,7 +158,6 @@ Computations / descriptive stats Series.unique Series.nunique Series.is_unique - Series.is_monotonic Series.is_monotonic_increasing Series.is_monotonic_decreasing Series.value_counts From 6d07cc2d0cbcf7913c5a4bf3a4d20ea82dcef8e4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 30 Jan 2024 14:19:21 +0000 Subject: [PATCH 140/162] Fix another custreamz test --- python/custreamz/custreamz/tests/test_dataframes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 779560a394a..bae4b051cae 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -771,7 +771,7 @@ def test_groupby_aggregate_with_start_state(stream): assert assert_eq(output1[0][1].reset_index(), out_df1) assert assert_eq(output2[0].reset_index(), out_df2) - example = cudf.DataFrame({"name": [], "amount": []}) + example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, example=example).groupby(["name"]) output3 = sdf.amount.sum(start=output0[0]).stream.gather().sink_to_list() output4 = ( From 71d87d53632ff03a7fa92901c8d066ffeab3847a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jan 2024 
08:20:41 -0800 Subject: [PATCH 141/162] Add back reftarget change for cudf.Index --- docs/cudf/source/conf.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 035ee586822..0100c331e72 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -454,6 +454,12 @@ def on_missing_reference(app, env, node, contnode): _prefixed_domain_objects[f"{prefix}{name}"] = name reftarget = node.get("reftarget") + if reftarget == "cudf.core.index.Index": + # We don't exposed docs for `cudf.core.index.Index` + # hence we would want the docstring & mypy references to + # use `cudf.Index` + node["reftarget"] = "cudf.Index" + return contnode if "namespacecudf" in reftarget: node["reftarget"] = "cudf" return contnode From 3438af0e3aa2ae7b6b16bd8e5a0ff0141dd633c7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 17:33:44 +0000 Subject: [PATCH 142/162] Revert "Add back reftarget change for cudf.Index" This reverts commit 71d87d53632ff03a7fa92901c8d066ffeab3847a. 
--- docs/cudf/source/conf.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 0100c331e72..035ee586822 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -454,12 +454,6 @@ def on_missing_reference(app, env, node, contnode): _prefixed_domain_objects[f"{prefix}{name}"] = name reftarget = node.get("reftarget") - if reftarget == "cudf.core.index.Index": - # We don't exposed docs for `cudf.core.index.Index` - # hence we would want the docstring & mypy references to - # use `cudf.Index` - node["reftarget"] = "cudf.Index" - return contnode if "namespacecudf" in reftarget: node["reftarget"] = "cudf" return contnode From ffa473e4f6b7515cd78e9cd5f9bcb8537e32ae62 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:11:19 +0000 Subject: [PATCH 143/162] Move abs to IndexedFrame --- python/cudf/cudf/core/frame.py | 26 -------------------------- python/cudf/cudf/core/indexed_frame.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a1c5cf40024..9342e9439c3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1549,32 +1549,6 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate - def abs(self): - """ - Return a Series/DataFrame with absolute numeric value of each element. - - This function only applies to elements that are all numeric. - - Returns - ------- - DataFrame/Series - Absolute value of each element. 
- - Examples - -------- - Absolute numeric values in a Series - - >>> s = cudf.Series([-1.10, 2, -3.33, 4]) - >>> s.abs() - 0 1.10 - 1 2.00 - 2 3.33 - 3 4.00 - dtype: float64 - """ - return self._unaryop("abs") - @_cudf_nvtx_annotate def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8c3276d7703..52fc5b3808e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -987,6 +987,32 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): output._copy_type_metadata(self, include_index=False) return self._mimic_inplace(output, inplace=inplace) + @_cudf_nvtx_annotate + def abs(self): + """ + Return a Series/DataFrame with absolute numeric value of each element. + + This function only applies to elements that are all numeric. + + Returns + ------- + DataFrame/Series + Absolute value of each element. + + Examples + -------- + Absolute numeric values in a Series + + >>> s = cudf.Series([-1.10, 2, -3.33, 4]) + >>> s.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 + """ + return self._unaryop("abs") + def _copy_type_metadata( self, other: Self, From abcd15d1bcfaed8fda2180fae4ecc3b5dd325f8c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:15:59 +0000 Subject: [PATCH 144/162] Move head and tail to IndexedFrame --- python/cudf/cudf/core/frame.py | 227 +------------------------ python/cudf/cudf/core/indexed_frame.py | 225 ++++++++++++++++++++++++ 2 files changed, 226 insertions(+), 226 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9342e9439c3..ae4f6180eec 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,7 +46,7 @@ from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column, find_common_type 
+from cudf.utils.dtypes import find_common_type from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf @@ -1745,121 +1745,6 @@ def _apply_cupy_ufunc_to_operands( data[i][name] = as_column(out).set_mask(mask) return data - @_cudf_nvtx_annotate - def dot(self, other, reflect=False): - """ - Get dot product of frame and other, (binary operator `dot`). - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - other : Sequence, Series, or DataFrame - Any multiple element data structure, or list-like object. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. - - Returns - ------- - scalar, Series, or DataFrame - The result of the operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([[1, 2, 3, 4], - ... [5, 6, 7, 8]]) - >>> df @ df.T - 0 1 - 0 30 70 - 1 70 174 - >>> s = cudf.Series([1, 1, 1, 1]) - >>> df @ s - 0 10 - 1 26 - dtype: int64 - >>> [1, 2, 3, 4] @ s - 10 - """ - # TODO: This function does not currently support nulls. 
- lhs = self.values - result_index = None - result_cols = None - if isinstance(self, cudf.Series) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self.index.union(other.index) - if len(common) > len(self.index) or len(common) > len(other.index): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(index=common, copy=False).values - rhs = other.reindex(index=common, copy=False).values - if isinstance(other, cudf.DataFrame): - result_index = other._data.to_pandas_index() - elif isinstance(self, cudf.DataFrame) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) - if len(common) > len(self._data.names) or len(common) > len( - other.index - ): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(columns=common, copy=False) - result_index = lhs.index - - rhs = other.reindex(index=common, copy=False).values - lhs = lhs.values - if isinstance(other, cudf.DataFrame): - result_cols = other._data.to_pandas_index() - - elif isinstance( - other, (cupy.ndarray, np.ndarray) - ) or can_convert_to_column(other): - rhs = cupy.asarray(other) - else: - # TODO: This should raise an exception, not return NotImplemented, - # but __matmul__ relies on the current behavior. We should either - # move this implementation to __matmul__ and call it from here - # (checking for NotImplemented and raising NotImplementedError if - # that's what's returned), or __matmul__ should catch a - # NotImplementedError from here and return NotImplemented. The - # latter feels cleaner (putting the implementation in this method - # rather than in the operator) but will be slower in the (highly - # unlikely) case that we're multiplying a cudf object with another - # type of object that somehow supports this behavior. 
- return NotImplemented - if reflect: - lhs, rhs = rhs, lhs - - result = lhs.dot(rhs) - if len(result.shape) == 1: - return cudf.Series( - result, - index=self.index if result_index is None else result_index, - ) - if len(result.shape) == 2: - return cudf.DataFrame( - result, - index=self.index if result_index is None else result_index, - columns=result_cols, - ) - return result.item() - - @_cudf_nvtx_annotate - def __matmul__(self, other): - return self.dot(other) - - @_cudf_nvtx_annotate - def __rmatmul__(self, other): - return self.dot(other, reflect=True) - # Unary logical operators @_cudf_nvtx_annotate def __neg__(self): @@ -2629,116 +2514,6 @@ def __deepcopy__(self, memo): def __copy__(self): return self.copy(deep=False) - @_cudf_nvtx_annotate - def head(self, n=5): - """ - Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `n` rows, equivalent to ``df[:-n]``. - - Parameters - ---------- - n : int, default 5 - Number of rows to select. - - Returns - ------- - DataFrame or Series - The first `n` rows of the caller object. - - Examples - -------- - **Series** - - >>> ser = cudf.Series(['alligator', 'bee', 'falcon', - ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) - >>> ser - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - 6 shark - 7 whale - 8 zebra - dtype: object - - Viewing the first 5 lines - - >>> ser.head() - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - dtype: object - - Viewing the first `n` lines (three in this case) - - >>> ser.head(3) - 0 alligator - 1 bee - 2 falcon - dtype: object - - For negative values of `n` - - >>> ser.head(-3) - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - dtype: object - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.head(2) - key val - 0 0 10.0 - 1 1 11.0 - """ - return self.iloc[:n] - - @_cudf_nvtx_annotate - def tail(self, n=5): - """ - Returns the last n rows as a new DataFrame or Series - - Examples - -------- - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.tail(2) - key val - 3 3 13.0 - 4 4 14.0 - - **Series** - - >>> import cudf - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.tail(2) - 3 1 - 4 0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - @_cudf_nvtx_annotate @copy_docstring(Rolling) def rolling( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 52fc5b3808e..efa75772053 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1013,6 +1013,231 @@ def abs(self): """ return self._unaryop("abs") + @_cudf_nvtx_annotate + def dot(self, other, reflect=False): + """ + Get dot product of frame and other, (binary operator `dot`). + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, + `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, + `@`. 
+ + Parameters + ---------- + other : Sequence, Series, or DataFrame + Any multiple element data structure, or list-like object. + reflect : bool, default False + If ``True``, swap the order of the operands. See + https://docs.python.org/3/reference/datamodel.html#object.__ror__ + for more information on when this is necessary. + + Returns + ------- + scalar, Series, or DataFrame + The result of the operation. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame([[1, 2, 3, 4], + ... [5, 6, 7, 8]]) + >>> df @ df.T + 0 1 + 0 30 70 + 1 70 174 + >>> s = cudf.Series([1, 1, 1, 1]) + >>> df @ s + 0 10 + 1 26 + dtype: int64 + >>> [1, 2, 3, 4] @ s + 10 + """ + # TODO: This function does not currently support nulls. + lhs = self.values + result_index = None + result_cols = None + if isinstance(self, cudf.Series) and isinstance( + other, (cudf.Series, cudf.DataFrame) + ): + common = self.index.union(other.index) + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + lhs = self.reindex(index=common, copy=False).values + rhs = other.reindex(index=common, copy=False).values + if isinstance(other, cudf.DataFrame): + result_index = other._data.to_pandas_index() + elif isinstance(self, cudf.DataFrame) and isinstance( + other, (cudf.Series, cudf.DataFrame) + ): + common = self._data.to_pandas_index().union( + other.index.to_pandas() + ) + if len(common) > len(self._data.names) or len(common) > len( + other.index + ): + raise ValueError("matrices are not aligned") + + lhs = self.reindex(columns=common, copy=False) + result_index = lhs.index + + rhs = other.reindex(index=common, copy=False).values + lhs = lhs.values + if isinstance(other, cudf.DataFrame): + result_cols = other._data.to_pandas_index() + + elif isinstance( + other, (cp.ndarray, np.ndarray) + ) or cudf.utils.dtypes.can_convert_to_column(other): + rhs = cp.asarray(other) + else: + # TODO: This should raise an exception, not return 
NotImplemented, + # but __matmul__ relies on the current behavior. We should either + # move this implementation to __matmul__ and call it from here + # (checking for NotImplemented and raising NotImplementedError if + # that's what's returned), or __matmul__ should catch a + # NotImplementedError from here and return NotImplemented. The + # latter feels cleaner (putting the implementation in this method + # rather than in the operator) but will be slower in the (highly + # unlikely) case that we're multiplying a cudf object with another + # type of object that somehow supports this behavior. + return NotImplemented + if reflect: + lhs, rhs = rhs, lhs + + result = lhs.dot(rhs) + if len(result.shape) == 1: + return cudf.Series( + result, + index=self.index if result_index is None else result_index, + ) + if len(result.shape) == 2: + return cudf.DataFrame( + result, + index=self.index if result_index is None else result_index, + columns=result_cols, + ) + return result.item() + + @_cudf_nvtx_annotate + def __matmul__(self, other): + return self.dot(other) + + @_cudf_nvtx_annotate + def __rmatmul__(self, other): + return self.dot(other, reflect=True) + + @_cudf_nvtx_annotate + def head(self, n=5): + """ + Return the first `n` rows. + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + DataFrame or Series + The first `n` rows of the caller object. + + Examples + -------- + **Series** + + >>> ser = cudf.Series(['alligator', 'bee', 'falcon', + ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) + >>> ser + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + dtype: object + + Viewing the first 5 lines + + >>> ser.head() + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + dtype: object + + Viewing the first `n` lines (three in this case) + + >>> ser.head(3) + 0 alligator + 1 bee + 2 falcon + dtype: object + + For negative values of `n` + + >>> ser.head(-3) + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + dtype: object + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.head(2) + key val + 0 0 10.0 + 1 1 11.0 + """ + return self.iloc[:n] + + @_cudf_nvtx_annotate + def tail(self, n=5): + """ + Returns the last n rows as a new DataFrame or Series + + Examples + -------- + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.tail(2) + key val + 3 3 13.0 + 4 4 14.0 + + **Series** + + >>> import cudf + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.tail(2) + 3 1 + 4 0 + """ + if n == 0: + return self.iloc[0:0] + + return self.iloc[-n:] + def _copy_type_metadata( self, other: Self, From 50d287f97bec818a6027b55727ab1d8e538fd4ce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:17:53 +0000 Subject: [PATCH 145/162] Move isnull (alias) to IndexedFrame --- python/cudf/cudf/core/frame.py | 3 --- python/cudf/cudf/core/indexed_frame.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ae4f6180eec..5d33fbf70c3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1234,9 +1234,6 @@ def isna(self): data_columns = (col.isnull() for col in self._columns) return 
self._from_data_like_self(zip(self._column_names, data_columns)) - # Alias for isna - isnull = isna - @_cudf_nvtx_annotate def notna(self): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index efa75772053..6443391bfe1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1238,6 +1238,9 @@ def tail(self, n=5): return self.iloc[-n:] + # Alias for isna + isnull = Frame.isna + def _copy_type_metadata( self, other: Self, From 0013faa416fe3032dc6f2242204ef282bc8036f9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:18:38 +0000 Subject: [PATCH 146/162] Move kurtosis and skew to IndexedFrame --- python/cudf/cudf/core/frame.py | 119 ------------------------- python/cudf/cudf/core/indexed_frame.py | 119 +++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 119 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5d33fbf70c3..3a7545c93c0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2161,125 +2161,6 @@ def var( **kwargs, ) - @_cudf_nvtx_annotate - def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher's definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. 
- - Returns - ------- - Series or scalar - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series.kurtosis() - -1.1999999999999904 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.kurt() - a -1.2 - b -1.2 - dtype: float64 - - .. pandas-compat:: - **DataFrame.kurtosis** - - Parameters currently not supported are `level` and `numeric_only` - """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "kurtosis", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - # Alias for kurtosis. - kurt = kurtosis - - @_cudf_nvtx_annotate - def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 6 - dtype: int64 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) - >>> df.skew() - a 0.00000 - b -0.37037 - dtype: float64 - - .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** - - The `axis` parameter is not currently supported. 
- """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - @_cudf_nvtx_annotate def all(self, axis=0, skipna=True, **kwargs): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6443391bfe1..d63921c2c68 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1241,6 +1241,125 @@ def tail(self, n=5): # Alias for isna isnull = Frame.isna + @_cudf_nvtx_annotate + def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): + """ + Return Fisher's unbiased kurtosis of a sample. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + Series or scalar + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series.kurtosis() + -1.1999999999999904 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.kurt() + a -1.2 + b -1.2 + dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` + """ + if axis not in (0, "index", None, no_default): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "kurtosis", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + + # Alias for kurtosis. 
+ kurt = kurtosis + + @_cudf_nvtx_annotate + def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): + """ + Return unbiased Fisher-Pearson skew of a sample. + + Parameters + ---------- + skipna: bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + Series + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 6 + dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) + >>> df.skew() + a 0.00000 + b -0.37037 + dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + The `axis` parameter is not currently supported. + """ + if axis not in (0, "index", None, no_default): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + def _copy_type_metadata( self, other: Self, From 11ab9e818798a6f3d8d5def1beeb287e80cf63e8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:23:24 +0000 Subject: [PATCH 147/162] Move mask to IndexedFrame --- python/cudf/cudf/core/frame.py | 62 -------------------------- python/cudf/cudf/core/indexed_frame.py | 62 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3a7545c93c0..6a1f6b76302 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -615,68 +615,6 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ raise NotImplementedError - @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace: bool = False) -> 
Optional[Self]: - """ - Replace values where the condition is True. - - Parameters - ---------- - cond : bool Series/DataFrame, array-like - Where cond is False, keep the original value. - Where True, replace with corresponding value from other. - Callables are not supported. - other: scalar, list of scalars, Series/DataFrame - Entries where cond is True are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - DataFrame expects only Scalar or array like with scalars or - dataframe with same dimension as self. - - Series expects only scalar or series like with same length - inplace : bool, default False - Whether to perform the operation in place on the data. - - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) - >>> df.mask(df % 2 == 0, [-1, -1]) - A B - 0 1 3 - 1 -1 5 - 2 5 -1 - - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.mask(ser > 2, 10) - 0 10 - 1 10 - 2 2 - 3 1 - 4 0 - dtype: int64 - >>> ser.mask(ser > 2) - 0 - 1 - 2 2 - 3 1 - 4 0 - dtype: int64 - """ - - if not hasattr(cond, "__invert__"): - # We Invert `cond` below and call `where`, so - # making sure the object supports - # `~`(inversion) operator or `__invert__` method - cond = cupy.asarray(cond) - - return self.where(cond=~cond, other=other, inplace=inplace) - @_cudf_nvtx_annotate def pipe(self, func, *args, **kwargs): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d63921c2c68..c08571a4752 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1360,6 +1360,68 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) + @_cudf_nvtx_annotate + def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + """ + Replace values where the condition is True. 
+ + Parameters + ---------- + cond : bool Series/DataFrame, array-like + Where cond is False, keep the original value. + Where True, replace with corresponding value from other. + Callables are not supported. + other: scalar, list of scalars, Series/DataFrame + Entries where cond is True are replaced with + corresponding value from other. Callables are not + supported. Default is None. + + DataFrame expects only Scalar or array like with scalars or + dataframe with same dimension as self. + + Series expects only scalar or series like with same length + inplace : bool, default False + Whether to perform the operation in place on the data. + + Returns + ------- + Same type as caller + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) + >>> df.mask(df % 2 == 0, [-1, -1]) + A B + 0 1 3 + 1 -1 5 + 2 5 -1 + + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.mask(ser > 2, 10) + 0 10 + 1 10 + 2 2 + 3 1 + 4 0 + dtype: int64 + >>> ser.mask(ser > 2) + 0 + 1 + 2 2 + 3 1 + 4 0 + dtype: int64 + """ + + if not hasattr(cond, "__invert__"): + # We Invert `cond` below and call `where`, so + # making sure the object supports + # `~`(inversion) operator or `__invert__` method + cond = cp.asarray(cond) + + return self.where(cond=~cond, other=other, inplace=inplace) + def _copy_type_metadata( self, other: Self, From 2563b90f7e8b0a099b5acc6770d6fba2f33216a9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:25:42 +0000 Subject: [PATCH 148/162] Move various reductions to IndexedFrame --- python/cudf/cudf/core/frame.py | 340 ------------------------- python/cudf/cudf/core/indexed_frame.py | 339 ++++++++++++++++++++++++ 2 files changed, 339 insertions(+), 340 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6a1f6b76302..5d2d054d20c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -31,7 +31,6 @@ import cudf from cudf import _lib as libcudf 
from cudf._typing import Dtype -from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1815,290 +1814,6 @@ def max( **kwargs, ) - @_cudf_nvtx_annotate - def sum( - self, - axis=no_default, - skipna=True, - dtype=None, - numeric_only=False, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - - .. pandas-compat:: - **DataFrame.sum, Series.sum** - - Parameters currently not supported are `level`, `numeric_only`. - """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - @_cudf_nvtx_annotate - def product( - self, - axis=no_default, - skipna=True, - dtype=None, - numeric_only=False, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. 
- skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - - .. pandas-compat:: - **DataFrame.product, Series.product** - - Parameters currently not supported are level`, `numeric_only`. - """ - - return self._reduce( - # cuDF columns use "product" as the op name, but cupy uses "prod" - # and we need cupy if axis == 1. - "prod" if axis in {1, "columns"} else "product", - axis=axis, - skipna=skipna, - dtype=dtype, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - # Alias for pandas compatibility. - prod = product - - @_cudf_nvtx_annotate - def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - **kwargs - Additional keyword arguments to be passed to the function. 
- - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def std( - self, - axis=no_default, - skipna=True, - ddof=1, - numeric_only=False, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - - .. pandas-compat:: - **DataFrame.std, Series.std** - - Parameters currently not supported are `level` and - `numeric_only` - """ - - return self._reduce( - "std", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - @_cudf_nvtx_annotate - def var( - self, - axis=no_default, - skipna=True, - ddof=1, - numeric_only=False, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. 
If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - - .. pandas-compat:: - **DataFrame.var, Series.var** - - Parameters currently not supported are `level` and - `numeric_only` - """ - return self._reduce( - "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - @_cudf_nvtx_annotate def all(self, axis=0, skipna=True, **kwargs): """ @@ -2217,61 +1932,6 @@ def any(self, axis=0, skipna=True, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. For Series this - parameter is unused and defaults to 0. - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. - - .. pandas-compat:: - **DataFrame.median, Series.median** - - Parameters currently not supported are `level` and `numeric_only`. 
- """ - return self._reduce( - "median", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - @_cudf_nvtx_annotate @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c08571a4752..82e355ddfd0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1241,6 +1241,345 @@ def tail(self, n=5): # Alias for isna isnull = Frame.isna + @_cudf_nvtx_annotate + def sum( + self, + axis=no_default, + skipna=True, + dtype=None, + numeric_only=False, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + + .. pandas-compat:: + **DataFrame.sum, Series.sum** + + Parameters currently not supported are `level`, `numeric_only`. 
+ """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + @_cudf_nvtx_annotate + def product( + self, + axis=no_default, + skipna=True, + dtype=None, + numeric_only=False, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + + .. pandas-compat:: + **DataFrame.product, Series.product** + + Parameters currently not supported are level`, `numeric_only`. + """ + + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "prod" if axis in {1, "columns"} else "product", + axis=axis, + skipna=skipna, + dtype=dtype, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. + prod = product + + @_cudf_nvtx_annotate + def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. 
+ skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + + def median( + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. For Series this + parameter is unused and defaults to 0. + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + scalar + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + + .. pandas-compat:: + **DataFrame.median, Series.median** + + Parameters currently not supported are `level` and `numeric_only`. + + .. pandas-compat:: + **DataFrame.median, Series.median** + + Parameters currently not supported are `level` and `numeric_only`. + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + + @_cudf_nvtx_annotate + def std( + self, + axis=no_default, + skipna=True, + ddof=1, + numeric_only=False, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. 
+ + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + + .. pandas-compat:: + **DataFrame.std, Series.std** + + Parameters currently not supported are `level` and + `numeric_only` + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + @_cudf_nvtx_annotate + def var( + self, + axis=no_default, + skipna=True, + ddof=1, + numeric_only=False, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. + numeric_only : bool, default False + If True, includes only float, int, boolean columns. + If False, will raise error in-case there are + non-numeric columns. 
+ + Returns + ------- + scalar + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + + .. pandas-compat:: + **DataFrame.var, Series.var** + + Parameters currently not supported are `level` and + `numeric_only` + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + @_cudf_nvtx_annotate def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ From 9716f52ac51c2a23aa7eb4585d0c47f1db0a378c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:27:25 +0000 Subject: [PATCH 149/162] Move nans_to_nulls to IndexedFrame --- python/cudf/cudf/core/frame.py | 54 -------------------------- python/cudf/cudf/core/indexed_frame.py | 54 ++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5d2d054d20c..b06afa5da0b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2004,60 +2004,6 @@ def rolling( win_type=win_type, ) - @_cudf_nvtx_annotate - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls - - Returns - ------- - DataFrame or Series - - Examples - -------- - **Series** - - >>> import cudf, numpy as np - >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 - 4 10.0 - dtype: float64 - >>> series.nans_to_nulls() - 0 1.0 - 1 2.0 - 2 - 3 - 4 10.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) - >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) - >>> df - a b - 0 1.0 - 1 3.14 - 2 NaN NaN - >>> df.nans_to_nulls() - a b - 0 1.0 - 1 3.14 - 2 - """ - result_data = {} - for name, col in self._data.items(): - try: - result_data[name] = col.nans_to_nulls() - except 
AttributeError: - result_data[name] = col.copy() - return self._from_data_like_self(result_data) - @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 82e355ddfd0..5d634418655 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1761,6 +1761,60 @@ def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: return self.where(cond=~cond, other=other, inplace=inplace) + @_cudf_nvtx_annotate + def nans_to_nulls(self): + """ + Convert nans (if any) to nulls + + Returns + ------- + DataFrame or Series + + Examples + -------- + **Series** + + >>> import cudf, numpy as np + >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) + >>> series + 0 1.0 + 1 2.0 + 2 NaN + 3 + 4 10.0 + dtype: float64 + >>> series.nans_to_nulls() + 0 1.0 + 1 2.0 + 2 + 3 + 4 10.0 + dtype: float64 + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) + >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) + >>> df + a b + 0 1.0 + 1 3.14 + 2 NaN NaN + >>> df.nans_to_nulls() + a b + 0 1.0 + 1 3.14 + 2 + """ + result_data = {} + for name, col in self._data.items(): + try: + result_data[name] = col.nans_to_nulls() + except AttributeError: + result_data[name] = col.copy() + return self._from_data_like_self(result_data) + def _copy_type_metadata( self, other: Self, From fdf31e382833133d91d151fc868a78d4ad5d9ff4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:28:36 +0000 Subject: [PATCH 150/162] Move rolling to IndexedFrame --- python/cudf/cudf/core/frame.py | 16 ---------------- python/cudf/cudf/core/indexed_frame.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 
b06afa5da0b..5d4bea580bb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -42,9 +42,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import BinaryOperand, Scannable -from cudf.core.window import Rolling from cudf.utils import ioutils -from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf @@ -1990,20 +1988,6 @@ def __deepcopy__(self, memo): def __copy__(self): return self.copy(deep=False) - @_cudf_nvtx_annotate - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - win_type=win_type, - ) - @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5d634418655..10e6493ce7e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -66,8 +66,10 @@ _post_process_output_col, _return_arr_from_dtype, ) +from cudf.core.window import Rolling from cudf.utils import docutils from cudf.utils._numba import _CUDFNumbaConfig +from cudf.utils.docutils import copy_docstring from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf @@ -1761,6 +1763,20 @@ def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: return self.where(cond=~cond, other=other, inplace=inplace) + @_cudf_nvtx_annotate + @copy_docstring(Rolling) + def rolling( + self, window, min_periods=None, center=False, axis=0, win_type=None + ): + return Rolling( + self, + window, + min_periods=min_periods, + center=center, + axis=axis, + win_type=win_type, + ) + @_cudf_nvtx_annotate def 
nans_to_nulls(self): """ From 0bcdb2de6fb8c355e53c96fb166fe779a3b28807 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:29:13 +0000 Subject: [PATCH 151/162] Move notnull (alias) to IndexedFrame --- python/cudf/cudf/core/frame.py | 3 --- python/cudf/cudf/core/indexed_frame.py | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5d4bea580bb..5618647b7f7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1245,9 +1245,6 @@ def notna(self): data_columns = (col.notnull() for col in self._columns) return self._from_data_like_self(zip(self._column_names, data_columns)) - # Alias for notna - notnull = notna - @_cudf_nvtx_annotate def searchsorted( self, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10e6493ce7e..3688a65ff82 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1243,6 +1243,9 @@ def tail(self, n=5): # Alias for isna isnull = Frame.isna + # Alias for notna + notnull = Frame.notna + @_cudf_nvtx_annotate def sum( self, From ea7ebfbf28a2d7ad19488622b20c6013bade7016 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:30:47 +0000 Subject: [PATCH 152/162] Move pipe to IndexedFrame --- python/cudf/cudf/core/frame.py | 47 -------------------------- python/cudf/cudf/core/indexed_frame.py | 47 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5618647b7f7..b230bac3706 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -612,53 +612,6 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ raise NotImplementedError - @_cudf_nvtx_annotate - def pipe(self, func, *args, **kwargs): - """ - Apply ``func(self, *args, **kwargs)``. 
- - Parameters - ---------- - func : function - Function to apply to the Series/DataFrame/Index. - ``args``, and ``kwargs`` are passed into ``func``. - Alternatively a ``(callable, data_keyword)`` tuple where - ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the Series/DataFrame/Index. - args : iterable, optional - Positional arguments passed into ``func``. - kwargs : mapping, optional - A dictionary of keyword arguments passed into ``func``. - - Returns - ------- - object : the return type of ``func``. - - Examples - -------- - Use ``.pipe`` when chaining together functions that expect - Series, DataFrames or GroupBy objects. Instead of writing - - >>> func(g(h(df), arg1=a), arg2=b, arg3=c) - - You can write - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe(func, arg2=b, arg3=c) - ... ) - - If you have a function that takes the data as (say) the second - argument, pass a tuple indicating which keyword expects the - data. For example, suppose ``f`` takes its data as ``arg2``: - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe((func, 'arg2'), arg1=a, arg3=c) - ... ) - """ - return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate def fillna( self, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3688a65ff82..807445af2a9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1246,6 +1246,53 @@ def tail(self, n=5): # Alias for notna notnull = Frame.notna + @_cudf_nvtx_annotate + def pipe(self, func, *args, **kwargs): + """ + Apply ``func(self, *args, **kwargs)``. + + Parameters + ---------- + func : function + Function to apply to the Series/DataFrame/Index. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the Series/DataFrame/Index. 
+ args : iterable, optional + Positional arguments passed into ``func``. + kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + + Examples + -------- + Use ``.pipe`` when chaining together functions that expect + Series, DataFrames or GroupBy objects. Instead of writing + + >>> func(g(h(df), arg1=a), arg2=b, arg3=c) + + You can write + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe(func, arg2=b, arg3=c) + ... ) + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``f`` takes its data as ``arg2``: + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe((func, 'arg2'), arg1=a, arg3=c) + ... ) + """ + return cudf.core.common.pipe(self, func, *args, **kwargs) + @_cudf_nvtx_annotate def sum( self, From 7b0bcde0c4147b280ff12bfe478942bf863c907f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:32:16 +0000 Subject: [PATCH 153/162] Move conversion functions --- python/cudf/cudf/core/frame.py | 39 ------------------------ python/cudf/cudf/core/indexed_frame.py | 41 +++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b230bac3706..9164c35c00b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1880,22 +1880,6 @@ def any(self, axis=0, skipna=True, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) - - @_cudf_nvtx_annotate - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - - cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @_cudf_nvtx_annotate @ioutils.doc_to_dlpack() def 
to_dlpack(self): @@ -1903,29 +1887,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @_cudf_nvtx_annotate - def to_string(self): - r""" - Convert to string - - cuDF uses Pandas internals for efficient string formatting. - Set formatting options using pandas string formatting options and - cuDF objects will print identically to Pandas objects. - - cuDF supports `null/None` as a value in any column type, which - is transparently supported during this output process. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2] - >>> df['val'] = [float(i + 10) for i in range(3)] - >>> df.to_string() - ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' - """ - return repr(self) - @_cudf_nvtx_annotate def __str__(self): return self.to_string() diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 807445af2a9..df2fd881432 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -67,7 +67,7 @@ _return_arr_from_dtype, ) from cudf.core.window import Rolling -from cudf.utils import docutils +from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @@ -507,6 +507,45 @@ def empty(self): """ return self.size == 0 + @_cudf_nvtx_annotate + @ioutils.doc_to_json() + def to_json(self, path_or_buf=None, *args, **kwargs): + """{docstring}""" + + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) + + @_cudf_nvtx_annotate + @ioutils.doc_to_hdf() + def to_hdf(self, path_or_buf, key, *args, **kwargs): + """{docstring}""" + + cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + + @_cudf_nvtx_annotate + def to_string(self): + r""" + Convert to string + + cuDF uses Pandas internals for efficient string formatting. 
+ Set formatting options using pandas string formatting options and + cuDF objects will print identically to Pandas objects. + + cuDF supports `null/None` as a value in any column type, which + is transparently supported during this output process. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2] + >>> df['val'] = [float(i + 10) for i in range(3)] + >>> df.to_string() + ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' + """ + return repr(self) + def copy(self, deep: bool = True) -> Self: """Make a copy of this object's indices and data. From 28548f6c3b7b4aabecc7a325cb86f550f8026377 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:32:58 +0000 Subject: [PATCH 154/162] Add missing methods to the docs --- docs/cudf/source/user_guide/api_docs/index_objects.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index ff190da86bf..e669b95198c 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -41,6 +41,7 @@ Modifying and computations .. 
autosummary:: :toctree: api/ + Index.all Index.any Index.copy Index.drop_duplicates @@ -60,6 +61,7 @@ Modifying and computations Index.where Index.take Index.unique + Index.nunique Compatibility with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 59af57d3f5a776b636ec7d66b9d6079e314f07e9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:45:47 +0000 Subject: [PATCH 155/162] Add isnull and notnull to index docs --- docs/cudf/source/user_guide/api_docs/index_objects.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index e669b95198c..9c84f206010 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -78,7 +78,9 @@ Missing values Index.fillna Index.dropna Index.isna + Index.isnull Index.notna + Index.notnull Memory usage ~~~~~~~~~~~~ From c6f5392a0a8f0ad5687716944b697ebc9965cc82 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:47:50 +0000 Subject: [PATCH 156/162] Revert "Move isnull (alias) to IndexedFrame" This reverts commit 50d287f97bec818a6027b55727ab1d8e538fd4ce. 
--- python/cudf/cudf/core/frame.py | 3 +++ python/cudf/cudf/core/indexed_frame.py | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9164c35c00b..abbb730fb71 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1122,6 +1122,9 @@ def isna(self): data_columns = (col.isnull() for col in self._columns) return self._from_data_like_self(zip(self._column_names, data_columns)) + # Alias for isna + isnull = isna + @_cudf_nvtx_annotate def notna(self): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index df2fd881432..c12d55d6873 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1279,9 +1279,6 @@ def tail(self, n=5): return self.iloc[-n:] - # Alias for isna - isnull = Frame.isna - # Alias for notna notnull = Frame.notna From 63015385cf7cde97dead6b1ce973faef78d25865 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 18:48:30 +0000 Subject: [PATCH 157/162] Revert "Move notnull (alias) to IndexedFrame" This reverts commit 0bcdb2de6fb8c355e53c96fb166fe779a3b28807. 
--- python/cudf/cudf/core/frame.py | 3 +++ python/cudf/cudf/core/indexed_frame.py | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index abbb730fb71..996b1a80c79 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1201,6 +1201,9 @@ def notna(self): data_columns = (col.notnull() for col in self._columns) return self._from_data_like_self(zip(self._column_names, data_columns)) + # Alias for notna + notnull = notna + @_cudf_nvtx_annotate def searchsorted( self, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c12d55d6873..6d53198611e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1279,9 +1279,6 @@ def tail(self, n=5): return self.iloc[-n:] - # Alias for notna - notnull = Frame.notna - @_cudf_nvtx_annotate def pipe(self, func, *args, **kwargs): """ From a95bc6a0fabece2ce10d493c0c7acca743a09f29 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 19:13:33 +0000 Subject: [PATCH 158/162] Make sure str works even if to_string does not --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 996b1a80c79..96b62e185b3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1895,7 +1895,7 @@ def to_dlpack(self): @_cudf_nvtx_annotate def __str__(self): - return self.to_string() + return repr(self) @_cudf_nvtx_annotate def __deepcopy__(self, memo): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6d53198611e..9c35dba7cfd 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -544,7 +544,7 @@ def to_string(self): >>> df.to_string() ' key val\n0 0 10.0\n1 1 
11.0\n2 2 12.0' """ - return repr(self) + return str(self) def copy(self, deep: bool = True) -> Self: """Make a copy of this object's indices and data. From 4f0563d563bda4608ce93e2436a9d224abf4f073 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 19:13:56 +0000 Subject: [PATCH 159/162] Remove tests of now unsupported reductions --- python/cudf/cudf/tests/test_array_function.py | 4 ---- python/cudf/cudf/tests/test_index.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 090e8884991..58939f0ddd9 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -104,11 +104,7 @@ def test_array_func_missing_cudf_dataframe(pd_df, func): @pytest.mark.parametrize( "func", [ - lambda x: np.mean(x), - lambda x: np.sum(x), - lambda x: np.var(x, ddof=1), lambda x: np.unique(x), - lambda x: np.dot(x, x), ], ) def test_array_func_cudf_index(np_ar, func): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5cc1c93deff..996b651b9fe 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -132,11 +132,8 @@ def test_index_comparision(): [ lambda x: x.min(), lambda x: x.max(), - lambda x: x.sum(), - lambda x: x.mean(), lambda x: x.any(), lambda x: x.all(), - lambda x: x.prod(), ], ) def test_reductions(func): From 07e98723864fb719eb2682b4aa0abab54770d67a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 19:24:31 +0000 Subject: [PATCH 160/162] Address feedback --- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c35dba7cfd..15277ff5586 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1287,11 +1287,11 @@ def 
pipe(self, func, *args, **kwargs): Parameters ---------- func : function - Function to apply to the Series/DataFrame/Index. + Function to apply to the Series/DataFrame. ``args``, and ``kwargs`` are passed into ``func``. Alternatively a ``(callable, data_keyword)`` tuple where ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the Series/DataFrame/Index. + ``callable`` that expects the Series/DataFrame. args : iterable, optional Positional arguments passed into ``func``. kwargs : mapping, optional From 6ed75ffddcd678a5dbcfd5f0e2dccf98531b4282 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 30 Jan 2024 14:04:52 -0800 Subject: [PATCH 161/162] Fix V2 Parquet page alignment for use with zStandard compression (#14841) Fixes #14781 This PR makes changes to the Parquet writer to ensure that data to be compressed is properly aligned. Changes have also been made to the `EncPage` struct to make it easier to keep fields in that struct aligned, and also to reduce confusing re-use of fields. In particular, the `max_data_size` field can be any of a) the maximum possible size for the page data, b) the actual size of page data after encoding, c) the actual size of compressed page data. The latter two now have their own fields, `data_size` and `comp_data_size`. 
Authors: - Ed Seidl (https://github.com/etseidl) - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14841 --- cpp/src/io/parquet/page_enc.cu | 129 +++++++++++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 43 +++++---- cpp/src/io/parquet/writer_impl.cu | 7 +- cpp/tests/io/parquet_v2_test.cpp | 5 +- cpp/tests/io/parquet_writer_test.cpp | 17 ---- 5 files changed, 115 insertions(+), 86 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3cc4fda695f..2f351edd2b9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -388,6 +388,27 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, return packer.flush(); } +/** + * @brief Sets `s->cur` to point to the start of encoded page data. + * + * For V1 headers, this will be immediately after the repetition and definition level data. For V2, + * it will be at the next properly aligned location after the level data. The padding in V2 is + * needed for compressors that require aligned input. 
+ */ +template +inline void __device__ set_page_data_start(state_type* s) +{ + s->cur = s->page.page_data + s->page.max_hdr_size; + switch (s->page.page_type) { + case PageType::DATA_PAGE: + s->cur += s->page.level_bytes(); + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + break; + case PageType::DATA_PAGE_V2: s->cur += s->page.max_lvl_size; break; + } +} + } // anonymous namespace // blockDim {512,1,1} @@ -594,8 +615,13 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; + page_g.def_lvl_bytes = 0; + page_g.rep_lvl_bytes = 0; + page_g.max_lvl_size = 0; + page_g.comp_data_size = 0; page_g.max_hdr_size = MAX_V1_HDR_SIZE; page_g.max_data_size = ck_g.uniq_data_size; + page_g.data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; page_g.num_rows = ck_g.num_dict_entries; page_g.num_leaf_values = ck_g.num_dict_entries; @@ -689,12 +715,17 @@ CUDF_KERNEL void __launch_bounds__(128) page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); } if (!t) { - page_g.num_fragments = fragments_in_chunk - page_start; - page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; - page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; - page_g.page_type = data_page_type; - page_g.hdr_size = 0; - page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics + page_g.num_fragments = fragments_in_chunk - page_start; + page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; + page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; + page_g.page_type = data_page_type; + page_g.hdr_size = 0; + page_g.def_lvl_bytes = 0; + page_g.rep_lvl_bytes = 0; + page_g.max_lvl_size = 0; + page_g.data_size = 0; + page_g.comp_data_size = 0; + page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if 
(col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { @@ -716,13 +747,19 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.num_valid = num_valid; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); + if (write_v2_headers) { + page_g.max_lvl_size = + util::round_up_unsafe(def_level_size + rep_level_size, page_align); + } // get a different bound if using delta encoding if (is_use_delta) { auto const delta_len = delta_data_len(physical_type, type_id, page_g.num_leaf_values, page_size); page_size = max(page_size, delta_len); } - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + auto const max_data_size = + page_size + rle_pad + + (write_v2_headers ? page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -739,7 +776,9 @@ CUDF_KERNEL void __launch_bounds__(128) page_offset += util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { - comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page + num_pages]; + // V2 does not include level data in compressed size estimate + comp_page_offset += page_g.max_hdr_size + page_g.max_lvl_size + + comp_page_sizes[ck_g.first_page + num_pages]; } page_headers_size += page_g.max_hdr_size; max_page_data_size = max(max_page_data_size, page_g.max_data_size); @@ -774,8 +813,10 @@ CUDF_KERNEL void __launch_bounds__(128) } pages[ck_g.first_page + num_pages] = page_g; } + // page_sizes should be the number of bytes to be compressed, so don't include level + // data for V2. 
if (not page_sizes.empty()) { - page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; + page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size - page_g.max_lvl_size; } if (page_grstats) { page_grstats[ck_g.first_page + num_pages] = pagestats_g; } } @@ -1429,10 +1470,6 @@ __device__ void finish_page_encode(state_buf* s, return thrust::reduce(thrust::seq, hist_start, hist_end, 0U); }; - // V2 does not compress rep and def level data - size_t const skip_comp_size = - write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; - // this will be true if max_rep > 0 (i.e. there are lists) if (s->page.rep_histogram != nullptr) { // for repetition we get hist[0] from num_rows, and can derive hist[max_rep_level] @@ -1489,10 +1526,17 @@ __device__ void finish_page_encode(state_buf* s, // FIXME(ets): this needs to do error propagation back to the host CUDF_UNREACHABLE("detected possible page data corruption"); } - s->page.max_data_size = actual_data_size; + if (s->page.is_v2()) { + auto const d_base = base + s->page.max_lvl_size; + s->page.data_size = static_cast(end_ptr - d_base) + s->page.level_bytes(); + } else { + s->page.data_size = actual_data_size; + } if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + auto const c_base = base + s->page.max_lvl_size; + auto const bytes_to_compress = static_cast(end_ptr - c_base); + comp_in[blockIdx.x] = {c_base, bytes_to_compress}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + s->page.max_lvl_size, 0}; // size is unused } pages[blockIdx.x] = s->page; @@ -1503,10 +1547,10 @@ __device__ void finish_page_encode(state_buf* s, } // copy uncompressed bytes over - if (skip_comp_size != 0 && not comp_in.empty()) { + if (s->page.is_v2() and not comp_in.empty()) { uint8_t* const src = s->page.page_data + s->page.max_hdr_size; uint8_t* 
const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { + for (int i = t; i < s->page.level_bytes(); i += block_size) { dst[i] = src[i]; } } @@ -1536,13 +1580,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; - // if V1 data page, need space for the RLE length fields - if (s->page.page_type == PageType::DATA_PAGE) { - if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - } + set_page_data_start(s); } __syncthreads(); @@ -1771,13 +1809,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; - // if V1 data page, need space for the RLE length fields - if (s->page.page_type == PageType::DATA_PAGE) { - if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } - } + set_page_data_start(s); } __syncthreads(); @@ -1908,8 +1940,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + set_page_data_start(s); } __syncthreads(); @@ -2017,8 +2048,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) s->col = *s->ck.col_desc; s->rle_len_pos = nullptr; // get s->cur back to where it was at the end of encoding the rep and 
def level data - s->cur = - s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + set_page_data_start(s); } __syncthreads(); @@ -2142,11 +2172,10 @@ CUDF_KERNEL void __launch_bounds__(decide_compression_block_size) auto const num_pages = ck_g[warp_id].num_pages; for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; - auto const page_data_size = curr_page.max_data_size; - auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; - auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; + auto const page_data_size = curr_page.data_size; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { + auto const lvl_bytes = curr_page.is_v2() ? curr_page.level_bytes() : 0; compressed_data_size += comp_res->bytes_written + lvl_bytes; if (comp_res->status != compression_status::SUCCESS) { atomicOr(&compression_error[warp_id], 1); @@ -2614,14 +2643,13 @@ CUDF_KERNEL void __launch_bounds__(128) EncodeStatistics(hdr_start, &chunk_stats[page_g.chunk_id], col_g.stats_dtype, scratch); page_g.chunk->ck_stat_size = static_cast(hdr_end - hdr_start); } - uncompressed_page_size = page_g.max_data_size; + uncompressed_page_size = page_g.data_size; if (ck_g.is_compressed) { - auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; - auto const lvl_bytes = is_v2 ? page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; + auto const lvl_bytes = page_g.is_v2() ? 
page_g.level_bytes() : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; - page_g.max_data_size = compressed_page_size; + page_g.comp_data_size = compressed_page_size; } else { hdr_start = page_g.page_data; compressed_page_size = uncompressed_page_size; @@ -2708,19 +2736,26 @@ CUDF_KERNEL void __launch_bounds__(1024) if (t == 0) { page_g = first_page[page]; } __syncthreads(); - src = (ck_g.is_compressed) ? page_g.compressed_data : page_g.page_data; + src = ck_g.is_compressed ? page_g.compressed_data : page_g.page_data; // Copy page header hdr_len = page_g.hdr_size; memcpy_block<1024, true>(dst, src, hdr_len, t); src += page_g.max_hdr_size; dst += hdr_len; - // Copy page data uncompressed_size += hdr_len; - data_len = page_g.max_data_size; + data_len = ck_g.is_compressed ? page_g.comp_data_size : page_g.data_size; + // Copy page data. For V2, the level data and page data are disjoint. + if (page_g.is_v2()) { + auto const lvl_len = page_g.level_bytes(); + memcpy_block<1024, true>(dst, src, lvl_len, t); + src += page_g.max_lvl_size; + dst += lvl_len; + data_len -= lvl_len; + } memcpy_block<1024, true>(dst, src, data_len, t); dst += data_len; __syncthreads(); - if (!t && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } + if (t == 0 && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } } if (t == 0) { chunks[blockIdx.x].bfr_size = uncompressed_size; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d58c7f95389..b215cd7a20b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -560,30 +560,41 @@ struct EncColumnChunk { * @brief Struct describing an encoder data page */ struct EncPage { - uint8_t* page_data; //!< Ptr to uncompressed page - uint8_t* compressed_data; //!< Ptr to compressed page - uint16_t num_fragments; //!< Number of fragments in page - 
PageType page_type; //!< Page type - Encoding encoding; //!< Encoding used for page data - EncColumnChunk* chunk; //!< Chunk that this page belongs to + // all pointers at the top to keep things properly aligned + uint8_t* page_data; //!< Ptr to uncompressed page + uint8_t* compressed_data; //!< Ptr to compressed page + EncColumnChunk* chunk; //!< Chunk that this page belongs to + compression_result* comp_res; //!< Ptr to compression result + uint32_t* def_histogram; //!< Histogram of counts for each definition level + uint32_t* rep_histogram; //!< Histogram of counts for each repetition level + // put this here in case it's ever made 64-bit + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run + // the rest can be 4 byte aligned uint32_t chunk_id; //!< Index in chunk array - uint32_t hdr_size; //!< Size of page header + uint32_t hdr_size; //!< Actual size of encoded page header uint32_t max_hdr_size; //!< Maximum size of page header - uint32_t max_data_size; //!< Maximum size of coded page data (excluding header) + uint32_t max_data_size; //!< Maximum size of encoded page data (excluding header) + uint32_t data_size; //!< Actual size of encoded page data (includes level data) + uint32_t comp_data_size; //!< Actual size of compressed page data uint32_t start_row; //!< First row of page uint32_t num_rows; //!< Rows in page uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) - encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run - uint32_t* def_histogram; //!< Histogram of counts for each definition level - uint32_t* rep_histogram; //!< Histogram of counts for each repetition level - uint32_t var_bytes_size; //!< Number of variable length bytes in the page (byte arrays only) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data + uint32_t max_lvl_size; //!< Maximum size of level data (V2 only, 0 for V1) + uint32_t num_nulls; //!< Number of null values uint32_t num_valid; //!< Number of valid leaf values + uint32_t var_bytes_size; //!< Number of variable length bytes in the page (byte arrays only) + // enums and smaller stuff down here + PageType page_type; //!< Page type + Encoding encoding; //!< Encoding used for page data + uint16_t num_fragments; //!< Number of fragments in page + + constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + + constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } }; /** diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 93b225dca1b..0303439fb27 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2220,10 +2220,6 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { - // See issue #14781. Can remove this check once that is fixed. 
- CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD), - "V2 page headers cannot be used with ZSTD compression"); - _current_chunk_offset.resize(_out_sink.size()); // Write file header file_header_s fhdr; @@ -2405,7 +2401,8 @@ void writer::impl::write_parquet_data_to_sink( // skip dict pages if (enc_page.page_type == PageType::DICTIONARY_PAGE) { continue; } - int32_t this_page_size = enc_page.hdr_size + enc_page.max_data_size; + int32_t const this_page_size = + enc_page.hdr_size + (ck.is_compressed ? enc_page.comp_data_size : enc_page.data_size); // first_row_idx is relative to start of row group PageLocation loc{curr_pg_offset, this_page_size, enc_page.start_row - ck.start_row}; if (is_byte_arr) { var_bytes.push_back(enc_page.var_bytes_size); } diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 1a373ed92ae..25d58a96512 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -35,7 +35,7 @@ INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, TEST_P(ParquetV2Test, MultiColumn) { - constexpr auto num_rows = 50000; + constexpr auto num_rows = 50'000; auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); @@ -84,6 +84,7 @@ TEST_P(ParquetV2Test, MultiColumn) cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -156,6 +157,7 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); @@ -197,6 +199,7 @@ TEST_P(ParquetV2Test, Strings) cudf::io::parquet_writer_options out_opts = 
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::ZSTD) .metadata(expected_metadata); cudf::io::write_parquet(out_opts); diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 2df34c7928b..34061cb7bf8 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1064,7 +1064,6 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) auto const expected = table_view{{col0, col1}}; auto const filepath = temp_env->get_temp_filepath("DictionaryAdaptiveTest.parquet"); - // no compression so we can easily read page data cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .compression(cudf::io::compression_type::ZSTD) @@ -1116,7 +1115,6 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) auto const expected = table_view{{col0, col1}}; auto const filepath = temp_env->get_temp_filepath("DictionaryAlwaysTest.parquet"); - // no compression so we can easily read page data cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .compression(cudf::io::compression_type::ZSTD) @@ -1428,21 +1426,6 @@ TEST_F(ParquetWriterTest, RowGroupMetadata) static_cast(num_rows * sizeof(column_type))); } -// See #14772. -// zStandard compression cannot currently be used with V2 page headers due to buffer -// alignment issues. -// TODO: Remove this test when #14781 is closed. 
-TEST_F(ParquetWriterTest, ZstdWithV2Header) -{ - auto const expected = table_view{}; - - cudf::io::parquet_writer_options const out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected) - .compression(cudf::io::compression_type::ZSTD) - .write_v2_headers(true); - EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); -} - ///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template From bb59715162218c0c638f5c368e6871ca15168838 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 31 Jan 2024 14:52:37 -0600 Subject: [PATCH 162/162] Fix dask token normalization (#14829) This PR fixes cudf's `__dask_tokenization__` definitions so that they will produce data that can be deterministically tokenized when a `MultiIndex` is present. I ran into this problem in dask-expr for an index with datetime data (a case reflected by the new test). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/14829 --- python/cudf/cudf/core/frame.py | 6 ++++-- python/cudf/cudf/core/indexed_frame.py | 8 +++++--- python/dask_cudf/dask_cudf/tests/test_dispatch.py | 14 +++++++++++++- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 96b62e185b3..79005193b4e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1954,10 +1954,12 @@ def _repeat( @_cudf_nvtx_annotate @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.to_pandas(), + normalize_token(self._dtypes), + normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 
15277ff5586..0a0cefde9cd 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6176,11 +6176,13 @@ def convert_dtypes( @_warn_no_dask_cudf def __dask_tokenize__(self): + from dask.base import normalize_token + return [ type(self), - self._dtypes, - self.index, - self.hash_values().values_host, + normalize_token(self._dtypes), + normalize_token(self.index), + normalize_token(self.hash_values().values_host), ] diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index c64e25fd437..76703206726 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. + +from datetime import datetime import numpy as np import pandas as pd @@ -82,6 +84,16 @@ def test_deterministic_tokenize(index): assert tokenize(df2) == tokenize(df2) +def test_deterministic_tokenize_multiindex(): + dt = datetime.strptime("1995-03-15", "%Y-%m-%d") + index = cudf.MultiIndex( + levels=[[1, 2], [dt]], + codes=[[0, 1], [0, 0]], + ) + df = cudf.DataFrame(index=index) + assert tokenize(df) == tokenize(df) + + @pytest.mark.parametrize("preserve_index", [True, False]) def test_pyarrow_schema_dispatch(preserve_index): from dask.dataframe.dispatch import (