diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 9f26ac8ee78..f953c894db2 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -2,7 +2,11 @@
 from warnings import warn
 
 import cupy as cp
+import numpy as np
 
+from cudf.core.column import as_column
+from cudf.core.frame import Frame
+from cudf.core.index import RangeIndex
 from cudf.core.series import Index, Series
 
 
@@ -59,3 +63,73 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
     values.name = name
 
     return labels, cats.values if return_cupy_array else Index(cats)
+
+
+def _linear_interpolation(column, index=None):
+    """
+    Interpolate over a float column assuming evenly spaced values.
+
+    The data is treated as evenly spaced along the x-axis: for example
+    [1.0, NaN, 3.0] is interpolated assuming the NaN lies half way
+    between the two valid values, yielding [1.0, 2.0, 3.0].
+
+    ``index`` is accepted so that every interpolator shares the same
+    signature, but it is ignored: positions ``0..len(column) - 1`` are
+    always used as the x values.
+    """
+    positions = RangeIndex(start=0, stop=len(column), step=1)
+    return _index_or_values_interpolation(column, index=positions)
+
+
+def _index_or_values_interpolation(column, index=None):
+    """
+    Linearly interpolate over a float column using ``index`` as x values.
+
+    For example the data and index [1.0, NaN, 4.0], [1, 3, 4] result
+    in [1.0, 3.0, 4.0].
+
+    NaNs before the first valid value stay NaN; NaNs after the last
+    valid value are filled by ``cupy.interp`` with that last value.
+
+    If the column has no NaNs, or nothing but NaNs, it is returned
+    unchanged; otherwise a new cupy array is returned.
+    """
+    # figure out where the nans are
+    mask = cp.isnan(column)
+
+    # trivial cases, all nan or no nans
+    num_nan = mask.sum()
+    if num_nan == 0 or num_nan == len(column):
+        return column
+
+    to_interp = Frame(data={None: column}, index=index)
+    known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
+
+    known_x = known_x_and_y._index._column.values
+    known_y = known_x_and_y._data.columns[0].values
+
+    result = cp.interp(to_interp._index.values, known_x, known_y)
+
+    # cp.interp also fills positions before the first known point,
+    # but leading NaNs must stay NaN: blank out everything before the
+    # first valid (non-NaN) position
+    first_valid_idx = (mask == 0).argmax().item()
+    result[:first_valid_idx] = np.nan
+    return result
+
+
+def get_column_interpolator(method):
+    """
+    Return the column-level interpolation function for ``method``.
+
+    Raises ``ValueError`` if ``method`` is not one of 'linear',
+    'index' or 'values'.
+    """
+    interpolator = {
+        "linear": _linear_interpolation,
+        "index": _index_or_values_interpolation,
+        "values": _index_or_values_interpolation,
+    }.get(method, None)
+    if not interpolator:
+        raise ValueError(f"Interpolation method `{method}` not found")
+    return interpolator
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8cdc6eebaee..ad5dc3430b0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5893,6 +5893,38 @@ def _from_columns(cls, cols, index=None, columns=None):
 
         return cls(data=data, index=index,)
 
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+
+        # Fail up front with a descriptive error when every column is
+        # object dtype, instead of delegating to the parent implementation
+        if all(dt == np.dtype("object") for dt in self.dtypes):
+            raise TypeError(
+                "Cannot interpolate with all object-dtype "
+                "columns in the DataFrame. Try setting at "
+                "least one column to a numeric dtype."
+            )
+
+        return super().interpolate(
+            method=method,
+            axis=axis,
+            limit=limit,
+            inplace=inplace,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            downcast=downcast,
+            **kwargs,
+        )
+
     def quantile(
         self,
         q=0.5,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 6a976f54c2b..9e5787c6d5f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1443,6 +1443,101 @@ def _apply_boolean_mask(self, boolean_mask):
         result._copy_type_metadata(self)
         return result
 
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+        """
+        Interpolate data values between some points.
+
+        Parameters
+        ----------
+        method : str, default 'linear'
+            Interpolation technique to use. Currently, only
+            'linear', 'index' and 'values' are supported.
+            * 'linear': Ignore the index and treat the values as
+            equally spaced. This is the only method supported on MultiIndexes.
+            * 'index', 'values': linearly interpolate using the index as
+            an x-axis. Unsorted indices can lead to erroneous results.
+        axis : int, default 0
+            Axis to interpolate along. Currently,
+            only 'axis=0' is supported.
+        inplace : bool, default False
+            Currently unused; a new object is always returned.
+        limit_direction : str, optional
+            Only validated, never applied: if given, it must be
+            'forward' for the methods 'pad' and 'ffill' and 'backward'
+            for the methods 'backfill' and 'bfill'.
+        limit, limit_area, downcast, **kwargs
+            Currently unused.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller, interpolated at
+            some or all ``NaN`` values
+
+        Raises
+        ------
+        ValueError
+            If ``limit_direction`` conflicts with ``method``, or if
+            ``method`` is not a supported interpolation technique.
+        """
+
+        # A direction can only conflict with the method when the caller
+        # actually supplied one
+        if limit_direction is not None:
+            required_direction = {
+                "pad": "forward",
+                "ffill": "forward",
+                "backfill": "backward",
+                "bfill": "backward",
+            }.get(method)
+            if (
+                required_direction is not None
+                and limit_direction != required_direction
+            ):
+                raise ValueError(
+                    f"`limit_direction` must be '{required_direction}' "
+                    f"for method `{method}`"
+                )
+
+        # Resolve the interpolator first so that an unsupported method
+        # fails before any data is gathered
+        interpolator = cudf.core.algorithms.get_column_interpolator(method)
+
+        # 'linear' is purely positional, so the rows must keep their
+        # original order. The index-based methods use the index as x
+        # values, so the rows are sorted by it unless it is a RangeIndex
+        needs_sort = method != "linear" and not isinstance(
+            self._index, cudf.RangeIndex
+        )
+
+        data = self
+        if needs_sort:
+            perm_sort = data._index.argsort()
+            data = data._gather(perm_sort)
+
+        columns = {}
+        for colname, col in data._data.items():
+            if col.nullable:
+                col = col.astype("float64").fillna(np.nan)
+
+            # Interpolation methods may or may not need the index
+            columns[colname] = interpolator(col, index=data._index)
+
+        result = self._from_data(columns, index=data._index)
+
+        # Undo the sort so the result lines up with the caller's rows
+        return result._gather(perm_sort.argsort()) if needs_sort else result
+
     def _quantiles(
         self,
         q,
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
new file mode 100644
index 00000000000..66556c48828
--- /dev/null
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -0,0 +1,137 @@
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # basics
+        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
+        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
+        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_dataframe(data, method, axis):
+    # Pandas interpolate methods do not seem to work
+    # with nullable dtypes yet, so this method treats
+    # NAs as NaNs
+    # https://github.com/pandas-dev/pandas/issues/40252
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    expect = pdf.interpolate(method=method, axis=axis)
+    got = gdf.interpolate(method=method, axis=axis)
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0],
+        [1.0, None, 3.0],
+        [None, 2.0, None, 4.0],
+        [1.0, None, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [None, None, None, None],
+        [0.1, 0.2, 0.3],
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_series(data, method, axis):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method, axis=axis)
+    got = gsr.interpolate(method=method, axis=axis)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]
+)
+def test_interpolate_series_unsorted_index(data, index):
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method="values")
+    got = gsr.interpolate(method="values")
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,index",
+    [
+        ([1.0, None, 3.0], [2, 0, 1]),
+        ([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1]),
+    ],
+)
+def test_interpolate_series_linear_unsorted_index(data, index):
+    # 'linear' must ignore the index entirely, even when it is unsorted
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method="linear")
+    got = gsr.interpolate(method="linear")
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0],
+        [None, 2.0, 3.0, 4.0],
+        [1.0, 2.0, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [1.0, None, 3.0, None],
+        [None, 2.0, None, 4.0],
+        [None, None, None, None],
+    ],
+)
+@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
+@pytest.mark.parametrize("method", ["index", "values"])
+def test_interpolate_series_values_or_index(data, index, method):
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method)
+    got = gsr.interpolate(method=method)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,kwargs",
+    [
+        (
+            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
+            {"axis": 0, "method": "linear"},
+        ),
+        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
+        (
+            {"A": [1, 2, 3]},
+            {"method": "backfill", "limit_direction": "forward"},
+        ),
+    ],
+)
+def test_interpolate_dataframe_error_cases(data, kwargs):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=pdf.interpolate,
+        rfunc=gdf.interpolate,
+        lfunc_args_and_kwargs=([], kwargs),
+        rfunc_args_and_kwargs=([], kwargs),
+    )