From 75c357053fa10d034cc531d3f7ad6821ae5af4d3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 12:52:20 -0700 Subject: [PATCH 01/13] Enable axis=1 for scans. --- python/cudf/cudf/core/dataframe.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0aafae0a85b..a068e992a7d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6355,10 +6355,6 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): ------- DataFrame - Notes - ----- - Parameters currently not supported is `axis` - Examples -------- >>> import cudf @@ -6370,9 +6366,6 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): 2 1 7 3 1 7 """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( "cummin", axis=axis, skipna=skipna, *args, **kwargs ) @@ -6392,10 +6385,6 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): ------- DataFrame - Notes - ----- - Parameters currently not supported is `axis` - Examples -------- >>> import cudf @@ -6407,9 +6396,6 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): 2 3 9 3 4 10 """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( "cummax", axis=axis, skipna=skipna, *args, **kwargs ) @@ -6430,10 +6416,6 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): ------- DataFrame - Notes - ----- - Parameters currently not supported is `axis` - Examples -------- >>> import cudf @@ -6445,9 +6427,6 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): 2 6 24 3 10 34 """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( "cumsum", axis=axis, skipna=skipna, *args, **kwargs ) @@ -6467,10 +6446,6 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): ------- DataFrame - Notes - ----- - Parameters currently not supported is `axis` - Examples -------- >>> import cudf @@ -6482,9 +6457,6 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): 2 6 504 3 24 5040 """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( "cumprod", axis=axis, skipna=skipna, *args, **kwargs ) From 5ed37091fa4061475a4461935850518c18d13d1b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 13:34:01 -0700 Subject: [PATCH 02/13] Document axis, initial implementation of DataFrame._scan. --- python/cudf/cudf/core/dataframe.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a068e992a7d..5d7cb8eb898 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6340,6 +6340,24 @@ def _reduce( elif axis == 1: return self._apply_support_method_axis_1(op, **kwargs) + def _scan( + self, op, axis=None, *args, **kwargs, + ): + axis = self._get_axis_from_axis_arg(axis) + + if axis == 0: + result = [ + getattr(self[col], op)(*args, **kwargs) + for col in self._data.names + ] + + return DataFrame._from_data( + {col: result[i] for i, col in enumerate(self._data.names)}, + index=result[0].index, + ) + elif axis == 1: + return self._apply_support_method_axis_1(op, **kwargs) + def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the DataFrame. @@ -6347,6 +6365,8 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -6366,9 +6386,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): 2 1 7 3 1 7 """ - return self._apply_support_method( - "cummin", axis=axis, skipna=skipna, *args, **kwargs - ) + return self._scan("cummin", axis=axis, skipna=skipna, *args, **kwargs) def cummax(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6377,6 +6395,8 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -6407,6 +6427,8 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -6438,6 +6460,8 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. From 5b5502afec8e0e09397d1ca8dde03e15f984493d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 15:07:20 -0700 Subject: [PATCH 03/13] Unify scan implementations for Series. --- python/cudf/cudf/core/series.py | 146 +++++++++----------------------- 1 file changed, 39 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 177208fa921..3381de88182 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3933,6 +3933,39 @@ def count(self, level=None, **kwargs): return self.valid_count + def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + skipna = True if skipna is None else skipna + + if skipna: + result_col = self.nans_to_nulls()._column + else: + result_col = self._column.copy() + if result_col.has_nulls: + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + result_col.isnull().astype("int8").find_first_value(1) + ) + result_col[first_index:] = None + + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) pandas + # returns an int64 dtype for all input int or bool dtypes. + result_col = result_col.astype(np.int64) + return Series._from_data( + {self.name: result_col._apply_scan_op(op)}, index=self.index, + ) + def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series. @@ -3963,27 +3996,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): 3 1 4 1 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("min"), name=self.name, index=self.index, - ) + return self._scan("min", axis=axis, skipna=skipna) def cummax(self, axis=0, skipna=True, *args, **kwargs): """ @@ -4015,24 +4028,7 @@ def cummax(self, axis=0, skipna=True, *args, **kwargs): 3 5 4 5 """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("max"), name=self.name, index=self.index, - ) + return self._scan("max", axis=axis, skipna=skipna) def cumsum(self, axis=0, skipna=True, *args, **kwargs): """ @@ -4065,38 +4061,7 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs): 3 12 4 15 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if not is_decimal_dtype(result_col.dtype) and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) + return self._scan("sum", axis=axis, skipna=skipna, cast_to_int=True) def cumprod(self, axis=0, skipna=True, *args, **kwargs): """ @@ -4128,42 +4093,9 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): 3 40 4 120 """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if is_decimal_dtype(self.dtype): - raise NotImplementedError( - "cumprod does not currently support decimal types" - ) - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("product"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("product"), - name=self.name, - index=self.index, - ) + return self._scan( + "product", axis=axis, skipna=skipna, cast_to_int=True + ) def mode(self, dropna=True): """ From 37473277af2d013dce7d8e3ab27559e82a0bed1c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 16:57:58 -0700 Subject: [PATCH 04/13] Stop expecting axis=1 to fail. --- python/cudf/cudf/tests/test_dataframe.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8744238a062..0da7da7305c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8107,17 +8107,7 @@ def custom_func(df, column): @pytest.mark.parametrize( - "op", - [ - "count", - "cummin", - "cummax", - "cummax", - "cumprod", - "kurt", - "kurtosis", - "skew", - ], + "op", ["count", "kurt", "kurtosis", "skew"], ) def test_dataframe_axis1_unsupported_ops(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) From 2804f5c1f002016081a17ccdbeed7df5eb1a1f4c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 17:10:27 -0700 Subject: [PATCH 05/13] Move scan implementation up to Frame and reuse for DataFrame. --- python/cudf/cudf/core/dataframe.py | 24 ++++-------------- python/cudf/cudf/core/frame.py | 40 ++++++++++++++++++++++++------ python/cudf/cudf/core/series.py | 29 +--------------------- 3 files changed, 38 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5d7cb8eb898..4da29c6ac4c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6346,15 +6346,7 @@ def _scan( axis = self._get_axis_from_axis_arg(axis) if axis == 0: - result = [ - getattr(self[col], op)(*args, **kwargs) - for col in self._data.names - ] - - return DataFrame._from_data( - {col: result[i] for i, col in enumerate(self._data.names)}, - index=result[0].index, - ) + return super()._scan(op, axis, *args, **kwargs) elif axis == 1: return self._apply_support_method_axis_1(op, **kwargs) @@ -6386,7 +6378,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): 2 1 7 3 1 7 """ - return self._scan("cummin", axis=axis, skipna=skipna, *args, **kwargs) + return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) def cummax(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6416,9 +6408,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): 2 3 9 3 4 10 """ - return self._apply_support_method( - "cummax", axis=axis, skipna=skipna, *args, **kwargs - ) + return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6449,9 +6439,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): 2 6 24 3 10 34 """ - return self._apply_support_method( - "cumsum", axis=axis, skipna=skipna, *args, **kwargs - ) + return self._scan("sum", axis=axis, skipna=skipna, *args, **kwargs) def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6481,9 +6469,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): 2 6 504 3 24 5040 """ - return self._apply_support_method( - "cumprod", axis=axis, skipna=skipna, *args, **kwargs - ) + return self._scan("prod", axis=axis, skipna=skipna, *args, **kwargs) def mode(self, axis=0, numeric_only=False, dropna=True): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3c6bc057af1..240858dbb6b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3681,20 +3681,44 @@ def _get_axis_from_axis_arg(cls, axis): try: return cls._SUPPORT_AXIS_LOOKUP[axis] except KeyError: - valid_axes = ", ".join( - ( - ax - for ax in cls._SUPPORT_AXIS_LOOKUP.keys() - if ax is not None - ) - ) - raise ValueError(f"Invalid axis, must be one of {valid_axes}.") + raise ValueError(f"No axis named {axis} for object type {cls}") def _reduce(self, *args, **kwargs): raise NotImplementedError( f"Reductions are not supported for objects of type {type(self)}." ) + def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + skipna = True if skipna is None else skipna + + results = {} + for name, col in self._data.items(): + if skipna: + result_col = self._data[name].nans_to_nulls() + else: + result_col = self._data[name].copy() + if result_col.has_nulls: + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + result_col.isnull().astype("int8").find_first_value(1) + ) + result_col[first_index:] = None + + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = result_col._apply_scan_op(op) + return self._from_data(results, index=self.index) + def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3381de88182..c4d19540958 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3937,34 +3937,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): - # For reductions that accumulate a value (e.g. sum, not max) pandas - # returns an int64 dtype for all input int or bool dtypes. - result_col = result_col.astype(np.int64) - return Series._from_data( - {self.name: result_col._apply_scan_op(op)}, index=self.index, - ) + return super()._scan(op, axis, skipna, cast_to_int) def cummin(self, axis=None, skipna=True, *args, **kwargs): """ From a190e6fa36f3455df71b447d8690e57875a409bb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 14:30:20 -0700 Subject: [PATCH 06/13] Move all scan implementations into Frame. --- python/cudf/cudf/core/dataframe.py | 123 +------------------ python/cudf/cudf/core/frame.py | 184 ++++++++++++++++++++++++----- python/cudf/cudf/core/series.py | 135 +-------------------- 3 files changed, 156 insertions(+), 286 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4da29c6ac4c..a6220038cb2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6346,131 +6346,10 @@ def _scan( axis = self._get_axis_from_axis_arg(axis) if axis == 0: - return super()._scan(op, axis, *args, **kwargs) + return super()._scan(op, axis=axis, *args, **kwargs) elif axis == 1: return self._apply_support_method_axis_1(op, **kwargs) - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummin() - a b - 0 1 7 - 1 1 7 - 2 1 7 - 3 1 7 - """ - return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummax() - a b - 0 1 7 - 1 2 8 - 2 3 9 - 3 4 10 - """ - return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumsum() - a b - 0 1 7 - 1 3 15 - 2 6 24 - 3 10 34 - """ - return self._scan("sum", axis=axis, skipna=skipna, *args, **kwargs) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative product of the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumprod() - a b - 0 1 7 - 1 2 56 - 2 6 504 - 3 24 5040 - """ - return self._scan("prod", axis=axis, skipna=skipna, *args, **kwargs) - def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 240858dbb6b..12376ea58e8 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3688,37 +3688,6 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) - def _scan(self, op, axis=None, skipna=True, cast_to_int=False): - skipna = True if skipna is None else skipna - - results = {} - for name, col in self._data.items(): - if skipna: - result_col = self._data[name].nans_to_nulls() - else: - result_col = self._data[name].copy() - if result_col.has_nulls: - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): - # For reductions that accumulate a value (e.g. sum, not max) - # pandas returns an int64 dtype for all int or bool dtypes. - result_col = result_col.astype(np.int64) - results[name] = result_col._apply_scan_op(op) - return self._from_data(results, index=self.index) - def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -4223,6 +4192,159 @@ def median( **kwargs, ) + # Scans + def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + skipna = True if skipna is None else skipna + + results = {} + for name, col in self._data.items(): + if skipna: + result_col = self._data[name].nans_to_nulls() + else: + result_col = self._data[name].copy() + if result_col.has_nulls: + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + result_col.isnull().astype("int8").find_first_value(1) + ) + result_col[first_index:] = None + + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = result_col._apply_scan_op(op) + return self._from_data(results, index=self.index) + + def cummin(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative minimum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummin() + a b + 0 1 7 + 1 1 7 + 2 1 7 + 3 1 7 + """ + return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + + def cummax(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative maximum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummax() + a b + 0 1 7 + 1 2 8 + 2 3 9 + 3 4 10 + """ + return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative sum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumsum() + a b + 0 1 7 + 1 3 15 + 2 6 24 + 3 10 34 + """ + return self._scan("sum", axis=axis, skipna=skipna, *args, **kwargs) + + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative product of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumprod() + a b + 0 1 7 + 1 2 56 + 2 6 504 + 3 24 5040 + """ + return self._scan("prod", axis=axis, skipna=skipna, *args, **kwargs) + class SingleColumnFrame(Frame): """A one-dimensional frame. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c4d19540958..405cd96b49a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3933,142 +3933,11 @@ def count(self, level=None, **kwargs): return self.valid_count - def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") - return super()._scan(op, axis, skipna, cast_to_int) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummin() - 0 1 - 1 1 - 2 1 - 3 1 - 4 1 - """ - return self._scan("min", axis=axis, skipna=skipna) - - def cummax(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummax() - 0 1 - 1 5 - 2 5 - 3 5 - 4 5 - """ - return self._scan("max", axis=axis, skipna=skipna) - - def cumsum(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumsum() - 0 1 - 1 6 - 2 8 - 3 12 - 4 15 - """ - return self._scan("sum", axis=axis, skipna=skipna, cast_to_int=True) - - def cumprod(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative product of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumprod() - 0 1 - 1 5 - 2 10 - 3 40 - 4 120 - """ - return self._scan( - "product", axis=axis, skipna=skipna, cast_to_int=True - ) + return super()._scan(op, axis=axis, *args, **kwargs) def mode(self, dropna=True): """ From 9f96b78df38544ac78255a0b428819d276ee0c1f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 14:42:52 -0700 Subject: [PATCH 07/13] Move Series scan impl to SingleColumnFrame to enable for Index types. --- python/cudf/cudf/core/frame.py | 11 ++++++++++- python/cudf/cudf/core/series.py | 6 ------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 12376ea58e8..9122f5b854f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4222,7 +4222,10 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) results[name] = result_col._apply_scan_op(op) - return self._from_data(results, index=self.index) + # TODO: This will work for Index because it's passing self._index + # (which is None), but eventually we may want to remove that parameter + # for Index._from_data and simplify. + return self._from_data(results, index=self._index) def cummin(self, axis=None, skipna=True, *args, **kwargs): """ @@ -4374,6 +4377,12 @@ def _reduce( ) return getattr(self._column, op)(**kwargs) + def _scan(self, op, axis=None, *args, **kwargs): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + return super()._scan(op, axis=axis, *args, **kwargs) + @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 405cd96b49a..75ac9941931 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3933,12 +3933,6 @@ def count(self, level=None, **kwargs): return self.valid_count - def _scan(self, op, axis=None, *args, **kwargs): - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - return super()._scan(op, axis=axis, *args, **kwargs) - def mode(self, dropna=True): """ Return the mode(s) of the dataset. From 1fa2e63c63fae458fc8b4c6bcaeab1923eb9bb0c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 14:49:05 -0700 Subject: [PATCH 08/13] Reenable cumulative ops for axis 1 correctly. --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a6220038cb2..b45d6510674 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6348,7 +6348,7 @@ def _scan( if axis == 0: return super()._scan(op, axis=axis, *args, **kwargs) elif axis == 1: - return self._apply_support_method_axis_1(op, **kwargs) + return self._apply_support_method_axis_1(f"cum{op}", **kwargs) def mode(self, axis=0, numeric_only=False, dropna=True): """ From a477d1f8b0313daaca4d231c86d7ae9657537dca Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 16:10:08 -0700 Subject: [PATCH 09/13] Add axis=1 tests and get most of them working. --- python/cudf/cudf/core/dataframe.py | 5 +++-- python/cudf/cudf/tests/test_dataframe.py | 27 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b45d6510674..a1ff6d16a0f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5851,7 +5851,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ) if data.ndim == 2: - num_cols = len(data[0]) + num_cols = data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, # this case can be assumed to be ndim == 1. @@ -6576,13 +6576,14 @@ def _apply_support_method_axis_0(self, method, *args, **kwargs): def _apply_support_method_axis_1(self, method, *args, **kwargs): # for dask metadata compatibility skipna = kwargs.pop("skipna", None) + skipna = True if skipna is None else skipna if method not in _cupy_nan_methods_map and skipna not in ( None, True, 1, ): raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" + f"Row-wise operations to calculate '{method}'" f" currently do not support `skipna=False`." ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0da7da7305c..32c2e9f9fbf 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1835,6 +1835,8 @@ def gdf(pdf): {"x": []}, ], ) +# @pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("axis", [1]) @pytest.mark.parametrize( "func", [ @@ -1852,19 +1854,32 @@ def gdf(pdf): lambda df, **kwargs: df.max(**kwargs), lambda df, **kwargs: df.std(ddof=1, **kwargs), lambda df, **kwargs: df.var(ddof=1, **kwargs), - lambda df, **kwargs: df.std(ddof=2, **kwargs), - lambda df, **kwargs: df.var(ddof=2, **kwargs), - lambda df, **kwargs: df.kurt(**kwargs), - lambda df, **kwargs: df.skew(**kwargs), + # lambda df, **kwargs: df.std(ddof=2, **kwargs), + # lambda df, **kwargs: df.var(ddof=2, **kwargs), + # lambda df, **kwargs: df.kurt(**kwargs), + # lambda df, **kwargs: df.skew(**kwargs), lambda df, **kwargs: df.all(**kwargs), lambda df, **kwargs: df.any(**kwargs), ], ) @pytest.mark.parametrize("skipna", [True, False, None]) -def test_dataframe_reductions(data, func, skipna): +def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) + try: + assert_eq( + func(pdf, axis=axis, skipna=skipna), + func(gdf, axis=axis, skipna=skipna), + check_dtype=False, + ) + except Exception as e: + acceptable_errors = ( + "Row-wise operations to calculate", + "module 'cupy' has no attribute", + ) + if any(a in str(e) for a in acceptable_errors): + return + raise e @pytest.mark.parametrize( From 745684f952a7dcab5a61245663e3bfd93bbe26c0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 11 Aug 2021 16:53:00 -0700 Subject: [PATCH 10/13] Get all possible axis=1 tests working and filter properly. --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 82 +++++++++++++----------- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a1ff6d16a0f..1b41703ba3f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6225,7 +6225,7 @@ def _prepare_for_rowwise_op(self, method, skipna): col.nullable for col in self._columns ): msg = ( - f"Row-wise operations to calculate '{method}' is not " + f"Row-wise operations to calculate '{method}' do not " f"currently support columns with null values. " f"Consider removing them with .dropna() " f"or using .fillna()." diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 32c2e9f9fbf..412b9fccb6b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1828,58 +1828,66 @@ def gdf(pdf): @pytest.mark.parametrize( "data", [ - {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, - {"x": [1, 2, 3], "y": [4, 5, 6]}, - {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, + { + "x": [np.nan, 2, 3, 4, 100, np.nan], + "y": [4, 5, 6, 88, 99, np.nan], + "z": [7, 8, 9, 66, np.nan, 77], + }, + {"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}, + { + "x": [np.nan, np.nan, np.nan], + "y": [np.nan, np.nan, np.nan], + "z": [np.nan, np.nan, np.nan], + }, + {"x": [], "y": [], "z": []}, {"x": []}, ], ) -# @pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("axis", [1]) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "func", [ - lambda df, **kwargs: df.min(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.product(**kwargs), - lambda df, **kwargs: df.cummin(**kwargs), - lambda df, **kwargs: df.cummax(**kwargs), - lambda df, **kwargs: df.cumsum(**kwargs), - lambda df, **kwargs: df.cumprod(**kwargs), - lambda df, **kwargs: df.mean(**kwargs), - lambda df, **kwargs: df.median(**kwargs), - lambda df, **kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.std(ddof=1, **kwargs), - lambda df, **kwargs: df.var(ddof=1, **kwargs), - # lambda df, **kwargs: df.std(ddof=2, **kwargs), - # lambda df, **kwargs: df.var(ddof=2, **kwargs), - # lambda df, **kwargs: df.kurt(**kwargs), - # lambda df, **kwargs: df.skew(**kwargs), - lambda df, **kwargs: df.all(**kwargs), - lambda df, **kwargs: df.any(**kwargs), + "min", + "max", + "sum", + "product", + "cummin", + "cummax", + "cumsum", + "cumprod", + "mean", + "median", + "sum", + "max", + "std", + "var", + "kurt", + "skew", + "all", + "any", ], ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) - try: + + # These reductions don't support axis=1 + if axis == 1 and func in ("kurt", "skew"): + return + + # We need cupy-supported operations when performing rowwise ops. + if func not in cudf.core.dataframe._cupy_nan_methods_map and axis == 1: + return + + # Test different degrees of freedom for var and std. + all_kwargs = [{"ddof": 1}, {"ddof": 2}] if func in ("var", "std") else [{}] + for kwargs in all_kwargs: assert_eq( - func(pdf, axis=axis, skipna=skipna), - func(gdf, axis=axis, skipna=skipna), + getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs), + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), check_dtype=False, ) - except Exception as e: - acceptable_errors = ( - "Row-wise operations to calculate", - "module 'cupy' has no attribute", - ) - if any(a in str(e) for a in acceptable_errors): - return - raise e @pytest.mark.parametrize( From 939b45561eadfb37120c1b0c797edaa9cb7c277d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 12 Aug 2021 09:32:16 -0700 Subject: [PATCH 11/13] Add in cast that was lost in the scramble. --- python/cudf/cudf/core/frame.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9122f5b854f..37584d2c3e4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4316,7 +4316,9 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): 2 6 24 3 10 34 """ - return self._scan("sum", axis=axis, skipna=skipna, *args, **kwargs) + return self._scan( + "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ @@ -4346,7 +4348,9 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): 2 6 504 3 24 5040 """ - return self._scan("prod", axis=axis, skipna=skipna, *args, **kwargs) + return self._scan( + "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) class SingleColumnFrame(Frame): From d804de5c8b9752b35fece4015697a8d72292b4e1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 16 Aug 2021 10:51:47 -0700 Subject: [PATCH 12/13] Add back Series examples. --- python/cudf/cudf/core/frame.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37584d2c3e4..324c555f974 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4246,6 +4246,19 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): Examples -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummin() + 0 1 + 1 1 + 2 1 + 3 1 + 4 1 + + **DataFrame** + >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) >>> df.cummin() @@ -4276,6 +4289,19 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): Examples -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummax() + 0 1 + 1 5 + 2 5 + 3 5 + 4 5 + + **DataFrame** + >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) >>> df.cummax() @@ -4307,6 +4333,19 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): Examples -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumsum() + 0 1 + 1 6 + 2 8 + 3 12 + 4 15 + + **DataFrame** + >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) >>> s.cumsum() @@ -4339,6 +4378,19 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): Examples -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumprod() + 0 1 + 1 5 + 2 10 + 3 40 + 4 120 + + **DataFrame** + >>> import cudf >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) >>> s.cumprod() From d4abda9658f61227daec6023ee2ae7fac8df9bf5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 16 Aug 2021 15:34:44 -0700 Subject: [PATCH 13/13] Change test to verify that the correct exceptions are thrown. --- python/cudf/cudf/core/dataframe.py | 6 ++++ python/cudf/cudf/tests/test_dataframe.py | 37 ++++++++++++++++-------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1b41703ba3f..3f9804daf05 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -62,6 +62,7 @@ "max": "nanmax", "sum": "nansum", "prod": "nanprod", + "product": "nanprod", "mean": "nanmean", "std": "nanstd", "var": "nanvar", @@ -6612,6 +6613,11 @@ def _apply_support_method_axis_1(self, method, *args, **kwargs): "Row-wise operations currently do not " "support `bool_only`." ) + # This parameter is only necessary for axis 0 reductions that cuDF + # performs internally. cupy already upcasts smaller integer/bool types + # to int64 when accumulating. + kwargs.pop("cast_to_int", None) + prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 412b9fccb6b..484278d0237 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1850,6 +1850,7 @@ def gdf(pdf): "min", "max", "sum", + "prod", "product", "cummin", "cummax", @@ -1872,22 +1873,34 @@ def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) - # These reductions don't support axis=1 - if axis == 1 and func in ("kurt", "skew"): - return - - # We need cupy-supported operations when performing rowwise ops. - if func not in cudf.core.dataframe._cupy_nan_methods_map and axis == 1: - return + # Reductions can fail in numerous possible ways when attempting row-wise + # reductions, which are only partially supported. Catching the appropriate + # exception here allows us to detect API breakage in the form of changing + # exceptions. + expected_exception = None + if axis == 1: + if func in ("kurt", "skew"): + expected_exception = NotImplementedError + elif func not in cudf.core.dataframe._cupy_nan_methods_map: + if skipna is False: + expected_exception = NotImplementedError + elif any(col.nullable for name, col in gdf.iteritems()): + expected_exception = ValueError + elif func in ("cummin", "cummax"): + expected_exception = AttributeError # Test different degrees of freedom for var and std. all_kwargs = [{"ddof": 1}, {"ddof": 2}] if func in ("var", "std") else [{}] for kwargs in all_kwargs: - assert_eq( - getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs), - getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), - check_dtype=False, - ) + if expected_exception is not None: + with pytest.raises(expected_exception): + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + else: + assert_eq( + getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs), + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + check_dtype=False, + ) @pytest.mark.parametrize(