Linear Interpolation of nans via cupy (#8767)

Adds Series- and DataFrame-level functions for linear interpolation of missing values, built around CuPy's `interp` method. The pandas `interpolate` API supports somewhat varied functionality for filling `NaN`s. It currently does not work for actual `<NA>` values - see the pandas issue [here](pandas-dev/pandas#40252). That said, one might expect both kinds of missing data to be treated equally for the purposes of interpolation, and this PR does that. While `cp.interp` is great for getting us off the ground, it only supports linear interpolation and its results aren't exactly what pandas produces. In particular, pandas will not fill `NaN`s at the start of the series, because the default value of `limit_direction` is `forward` and the default `limit` is `None`, which from my experimentation means 'unlimited'. Despite this, the `NaN`s at the end WILL get filled. This means we need to figure out where the first valid (non-`NaN`) value is and mask out the part of the series before it with `NaN`s. Closes #8685. Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: #8767
rapidsai · Aug 10, 2021 · b1c2dd4 · b1c2dd4
1 parent e8b05de
commit b1c2dd4
Show file tree

Hide file tree

Showing 4 changed files with 274 additions and 0 deletions.
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -2,7 +2,11 @@
 from warnings import warn
 
 import cupy as cp
+import numpy as np
 
+from cudf.core.column import as_column
+from cudf.core.frame import Frame
+from cudf.core.index import RangeIndex
 from cudf.core.series import Index, Series
 
 
@@ -59,3 +63,55 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
     values.name = name
 
     return labels, cats.values if return_cupy_array else Index(cats)
+
+
+def _linear_interpolation(column, index=None):
+    """
+    Interpolate over a float column. Implicitly assumes that values are
+    evenly spaced with respect to the x-axis, for example the data
+    [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way
+    between the two valid values, yielding [1.0, 2.0, 3.0]
+    """
+
+    index = RangeIndex(start=0, stop=len(column), step=1)
+    return _index_or_values_interpolation(column, index=index)
+
+
+def _index_or_values_interpolation(column, index=None):
+    """
+    Interpolate over a float column. assumes a linear interpolation
+    strategy using the index of the data to denote spacing of the x
+    values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
+    would result in [1.0, 3.0, 4.0]
+    """
+    # figure out where the nans are
+    mask = cp.isnan(column)
+
+    # trivial cases, all nan or no nans
+    num_nan = mask.sum()
+    if num_nan == 0 or num_nan == len(column):
+        return column
+
+    to_interp = Frame(data={None: column}, index=index)
+    known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
+
+    known_x = known_x_and_y._index._column.values
+    known_y = known_x_and_y._data.columns[0].values
+
+    result = cp.interp(to_interp._index.values, known_x, known_y)
+
+    # find the first nan
+    first_nan_idx = (mask == 0).argmax().item()
+    result[:first_nan_idx] = np.nan
+    return result
+
+
+def get_column_interpolator(method):
+    interpolator = {
+        "linear": _linear_interpolation,
+        "index": _index_or_values_interpolation,
+        "values": _index_or_values_interpolation,
+    }.get(method, None)
+    if not interpolator:
+        raise ValueError(f"Interpolation method `{method}` not found")
+    return interpolator
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -5893,6 +5893,36 @@ def _from_columns(cls, cols, index=None, columns=None):
 
         return cls(data=data, index=index,)
 
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+
+        if all(dt == np.dtype("object") for dt in self.dtypes):
+            raise TypeError(
+                "Cannot interpolate with all object-dtype "
+                "columns in the DataFrame. Try setting at "
+                "least one column to a numeric dtype."
+            )
+
+        return super().interpolate(
+            method=method,
+            axis=axis,
+            limit=limit,
+            inplace=inplace,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            downcast=downcast,
+            **kwargs,
+        )
+
     def quantile(
         self,
         q=0.5,

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -1443,6 +1443,75 @@ def _apply_boolean_mask(self, boolean_mask):
         result._copy_type_metadata(self)
         return result
 
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+        """
+        Interpolate data values between some points.
+
+        Parameters
+        ----------
+        method : str, default 'linear'
+            Interpolation technique to use. Currently,
+            only 'linear` is supported.
+            * 'linear': Ignore the index and treat the values as
+            equally spaced. This is the only method supported on MultiIndexes.
+            * 'index', 'values': linearly interpolate using the index as
+            an x-axis. Unsorted indices can lead to erroneous results.
+        axis : int, default 0
+            Axis to interpolate along. Currently,
+            only 'axis=0' is supported.
+        inplace : bool, default False
+            Update the data in place if possible.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller, interpolated at
+            some or all ``NaN`` values
+
+        """
+
+        if method in {"pad", "ffill"} and limit_direction != "forward":
+            raise ValueError(
+                f"`limit_direction` must be 'forward' for method `{method}`"
+            )
+        if method in {"backfill", "bfill"} and limit_direction != "backward":
+            raise ValueError(
+                f"`limit_direction` must be 'backward' for method `{method}`"
+            )
+
+        data = self
+
+        if not isinstance(data._index, cudf.RangeIndex):
+            perm_sort = data._index.argsort()
+            data = data._gather(perm_sort)
+
+        interpolator = cudf.core.algorithms.get_column_interpolator(method)
+        columns = {}
+        for colname, col in data._data.items():
+            if col.nullable:
+                col = col.astype("float64").fillna(np.nan)
+
+            # Interpolation methods may or may not need the index
+            columns[colname] = interpolator(col, index=data._index)
+
+        result = self._from_data(columns, index=data._index)
+
+        return (
+            result
+            if isinstance(data._index, cudf.RangeIndex)
+            else result._gather(perm_sort.argsort())
+        )
+
     def _quantiles(
         self,
         q,

diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
@@ -0,0 +1,119 @@
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # basics
+        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
+        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
+        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_dataframe(data, method, axis):
+    # Pandas interpolate methods do not seem to work
+    # with nullable dtypes yet, so this method treats
+    # NAs as NaNs
+    # https://github.com/pandas-dev/pandas/issues/40252
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    expect = pdf.interpolate(method=method, axis=axis)
+    got = gdf.interpolate(method=method, axis=axis)
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0],
+        [1.0, None, 3.0],
+        [None, 2.0, None, 4.0],
+        [1.0, None, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [None, None, None, None],
+        [0.1, 0.2, 0.3],
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_series(data, method, axis):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method, axis=axis)
+    got = gsr.interpolate(method=method, axis=axis)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]
+)
+def test_interpolate_series_unsorted_index(data, index):
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method="values")
+    got = gsr.interpolate(method="values")
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0],
+        [None, 2.0, 3.0, 4.0],
+        [1.0, 2.0, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [1.0, None, 3.0, None],
+        [None, 2.0, None, 4.0],
+        [None, None, None, None],
+    ],
+)
+@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
+@pytest.mark.parametrize("method", ["index", "values"])
+def test_interpolate_series_values_or_index(data, index, method):
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method)
+    got = gsr.interpolate(method=method)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,kwargs",
+    [
+        (
+            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
+            {"axis": 0, "method": "linear"},
+        ),
+        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
+        (
+            {"A": [1, 2, 3]},
+            {"method": "backfill", "limit_direction": "forward"},
+        ),
+    ],
+)
+def test_interpolate_dataframe_error_cases(data, kwargs):
+    gsr = cudf.DataFrame(data)
+    psr = gsr.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=psr.interpolate,
+        rfunc=gsr.interpolate,
+        lfunc_args_and_kwargs=([], kwargs),
+        rfunc_args_and_kwargs=([], kwargs),
+    )