From 94a5d4180b1281d4250e9f915e547789d8da3ce0 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 15 Apr 2022 16:08:09 -0400 Subject: [PATCH] Add support for null and non-numeric types in Series.diff and DataFrame.diff (#10625) This PR adds support for null values and non-numeric data types (such as datetime/timestamp columns, e.g. datetime ranges) in `Series.diff` and `DataFrame.diff`. In `DataFrame.diff`, datetime ranges only needed the numeric-dtype check removed, because the underlying `DataFrame.shift` already works for them. `Series.diff`, however, didn't use the `Series.shift` implementation, so there wasn't support for datetime ranges; it is now computed as `self - self.shift(periods=periods)`. ```python import datetime import numpy as np import pandas as pd import cudf dti = pd.to_datetime( ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1), datetime.datetime(2020, 1, 1)] ) df = cudf.DataFrame({"dates": dti}) df.diff(periods=1, axis=0) ``` closes #10212. Authors: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/10625 --- python/cudf/cudf/core/dataframe.py | 5 -- python/cudf/cudf/core/series.py | 52 +++++++-------------- python/cudf/cudf/tests/test_dataframe.py | 28 +++++++++--- python/cudf/cudf/tests/test_series.py | 58 ++++++++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 21 --------- 5 files changed, 96 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ae60cd91fac..8893b85c97c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2660,11 +2660,6 @@ def diff(self, periods=1, axis=0): if axis != 0: raise NotImplementedError("Only axis=0 is supported.") - if not all(is_numeric_dtype(i) for i in self.dtypes): - raise NotImplementedError( - "DataFrame.diff only supports numeric dtypes" - ) - if abs(periods) > len(self): df = cudf.DataFrame._from_data( { diff --git a/python/cudf/cudf/core/series.py
b/python/cudf/cudf/core/series.py index 6e15c03e6b4..20ba52afccd 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,11 +14,11 @@ import numpy as np import pandas as pd from pandas._config import get_option +from pandas.core.dtypes.common import is_float import cudf from cudf import _lib as libcudf from cudf._lib.scalar import _is_null_host_scalar -from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -42,7 +42,6 @@ arange, as_column, column, - column_empty_like, full, ) from cudf.core.column.categorical import ( @@ -64,7 +63,7 @@ ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.core.udf.scalar_function import _get_scalar_kernel -from cudf.utils import cudautils, docutils +from cudf.utils import docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -2969,19 +2968,22 @@ def digitize(self, bins, right=False): @_cudf_nvtx_annotate def diff(self, periods=1): - """Calculate the difference between values at positions i and i - N in - an array and store the output in a new array. + """First discrete difference of element. + + Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, + accepts negative values. Returns ------- Series First differences of the Series. - Notes - ----- - Diff currently only supports float and integer dtype columns with - no null values. 
- Examples -------- >>> import cudf @@ -3028,32 +3030,12 @@ def diff(self, periods=1): 5 dtype: int64 """ - if self.has_nulls: - raise AssertionError( - "Diff currently requires columns with no null values" - ) - - if not np.issubdtype(self.dtype, np.number): - raise NotImplementedError( - "Diff currently only supports numeric dtypes" - ) - - # TODO: move this libcudf - input_col = self._column - output_col = column_empty_like(input_col) - output_mask = column_empty_like(input_col, dtype="bool") - if output_col.size > 0: - cudautils.gpu_diff.forall(output_col.size)( - input_col, output_col, output_mask, periods - ) - - output_col = column.build_column( - data=output_col.data, - dtype=output_col.dtype, - mask=bools_to_mask(output_mask), - ) + if not is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) - return Series(output_col, name=self.name, index=self.index) + return self - self.shift(periods=periods) @copy_docstring(SeriesGroupBy) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 13ab0b35822..07261534777 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9098,7 +9098,7 @@ def test_groupby_cov_for_pandas_bug_case(): ], ) @pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) -def test_diff_dataframe_numeric_dtypes(data, periods): +def test_diff_numeric_dtypes(data, periods): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -9137,7 +9137,7 @@ def test_diff_decimal_dtypes(precision, scale, dtype): ) -def test_diff_dataframe_invalid_axis(): +def test_diff_invalid_axis(): gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): gdf.diff(periods=1, axis=1) @@ -9152,16 +9152,30 @@ def test_diff_dataframe_invalid_axis(): "string_col": ["a", "b", "c", "d", "e"], }, 
["a", "b", "c", "d", "e"], - [np.nan, None, np.nan, None], ], ) -def test_diff_dataframe_non_numeric_dypes(data): +def test_diff_unsupported_dtypes(data): gdf = cudf.DataFrame(data) with pytest.raises( - NotImplementedError, - match="DataFrame.diff only supports numeric dtypes", + TypeError, + match=r"unsupported operand type\(s\)", ): - gdf.diff(periods=2, axis=0) + gdf.diff() + + +def test_diff_many_dtypes(): + pdf = pd.DataFrame( + { + "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), + "bools": [True, True, True, False, True, True], + "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], + "ints": [1, 2, 3, 3, 4, 5], + "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], + } + ) + gdf = cudf.from_pandas(pdf) + assert_eq(pdf.diff(), gdf.diff()) + assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) def test_dataframe_assign_cp_np_array(): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6f0f77f0aa2..fccb9f680d9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -18,6 +18,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, + gen_rand, ) @@ -1724,3 +1725,60 @@ def test_isin_categorical(data, values): got = gsr.isin(values) expected = psr.isin(values) assert_eq(got, expected) + + +@pytest.mark.parametrize("dtype", NUMERIC_TYPES) +@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("data_empty", [False, True]) +def test_diff(dtype, period, data_empty): + if data_empty: + data = None + else: + if dtype == np.int8: + # to keep data in range + data = gen_rand(dtype, 100000, low=-2, high=2) + else: + data = gen_rand(dtype, 100000) + + gs = cudf.Series(data, dtype=dtype) + ps = pd.Series(data, dtype=dtype) + + expected_outcome = ps.diff(period) + diffed_outcome = gs.diff(period).astype(expected_outcome.dtype) + + if data_empty: + assert_eq(diffed_outcome, expected_outcome, check_index_type=False) + else: + 
assert_eq(diffed_outcome, expected_outcome) + + +@pytest.mark.parametrize( + "data", + [ + ["a", "b", "c", "d", "e"], + ], +) +def test_diff_unsupported_dtypes(data): + gs = cudf.Series(data) + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\)", + ): + gs.diff() + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("2020-01-01", "2020-01-06", freq="D"), + [True, True, True, False, True, True], + [1.0, 2.0, 3.5, 4.0, 5.0, -1.7], + [1, 2, 3, 3, 4, 5], + [np.nan, None, None, np.nan, np.nan, None], + ], +) +def test_diff_many_dtypes(data): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + assert_eq(ps.diff(), gs.diff()) + assert_eq(ps.diff(periods=2), gs.diff(periods=2)) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 4796402f14d..fb6e35f4f58 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -14,27 +14,6 @@ # -@cuda.jit -def gpu_diff(in_col, out_col, out_mask, N): - """Calculate the difference between values at positions i and i - N in an - array and store the output in a new array. - """ - i = cuda.grid(1) - - if N > 0: - if i < in_col.size: - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i < N: - out_mask[i] = False - else: - if i <= (in_col.size + N): - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i >= (in_col.size + N) and i < in_col.size: - out_mask[i] = False - - # Find segments