rapidsai · rapids-bot · Aug 10, 2021 · Jul 13, 2021 · Jul 13, 2021 · Jul 13, 2021
@@ -2,7 +2,10 @@
 from warnings import warn
 
 import cupy as cp
+import numpy as np
 
+from cudf.core.column import as_column
+from cudf.core.index import RangeIndex
 from cudf.core.series import Index, Series
 
 
@@ -59,3 +62,59 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
     values.name = name
 
     return labels, cats.values if return_cupy_array else Index(cats)
+
+
+def linear_interpolation(to_interp):
+    """
+    Interpolate over a float column. Implicitly assumes that values are
+    evenly spaced with respect to the x-axis, for example the data
+    [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way
+    between the two valid values, yielding [1.0, 2.0, 3.0]
+    """
+
+    to_interp._index = RangeIndex(start=0, stop=len(to_interp), step=1)
+    return index_or_values_interpolation(to_interp)
+
+
+def index_or_values_interpolation(to_interp):
+    """
+    Interpolate over a float column. assumes a linear interpolation
+    strategy using the index of the data to denote spacing of the x
+    values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
+    would result in [1.0, 3.0, 4.0]
+    """
+    colname = list(to_interp._data.keys())[0]
+    to_interp._data[colname] = (
+        to_interp._data[colname].astype("float64").fillna(np.nan)
+    )
+
+    col = to_interp._data[colname]
+
+    # figure out where the nans are
+    mask = cp.isnan(col)
+
+    # trivial case
+    if mask.all():
+        return col
+
+    mask = as_column(~mask)
+    known_x_and_y = to_interp._apply_boolean_mask(mask)
+
+    known_x = cp.asarray(known_x_and_y._index._column)
+    known_y = cp.asarray(known_x_and_y._data.columns[0])
+
+    result = cp.interp(cp.asarray(to_interp._index), known_x, known_y)
+
+    # find the first nan
+    first_nan_idx = as_column(mask).find_first_value(1)
+    result[:first_nan_idx] = np.nan
+    return result
+
+
+def get_column_interpolator(method):
+    if method == "linear":
+        return linear_interpolation
+    elif method in {"index", "values"}:
+        return index_or_values_interpolation
+    else:
+        raise ValueError(f"Interpolation method `{method}` not found")
@@ -5891,6 +5891,41 @@ def _from_columns(cls, cols, index=None, columns=None):
 
         return cls(data=data, index=index,)
 
+    @copy_docstring(Frame._interpolate)
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+
+        if (
+            method in {"index", "values"}
+            and not self.index.is_monotonic_increasing
+        ):
+            warnings.warn("Unsorted Index...")
+        if all(dt == np.dtype("object") for dt in self.dtypes):
+            raise TypeError(
+                "Cannot interpolate with all object-dtype "
+                "columns in the DataFrame. Try setting at "
+                "least one column to a numeric dtype."
+            )
+        return super()._interpolate(
+            method=method,
+            axis=axis,
+            limit=limit,
+            inplace=inplace,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            downcast=downcast,
+            **kwargs,
+        )
+
     def quantile(
         self,
         q=0.5,

@@ -65,7 +65,12 @@ def __init_subclass__(cls):
 
     @classmethod
     def _from_table(cls, table: Frame):
-        return cls(table._data, index=table._index)
+        # Build an instance of ``cls`` from ``table``. When the table
+        # carries an index it is first passed through
+        # ``cudf.Index._from_table``; a missing (None) index is handed
+        # to the constructor unchanged.
+        return cls(
+            table._data,
+            index=cudf.Index._from_table(table._index)
+            if table._index is not None
+            else table._index,
+        )
 
     def _mimic_inplace(
         self: T, result: Frame, inplace: bool = False
@@ -1415,7 +1420,6 @@ def _apply_boolean_mask(self, boolean_mask):
         rows corresponding to `False` is dropped
         """
         boolean_mask = as_column(boolean_mask)
-
         result = self.__class__._from_table(
             libcudf.stream_compaction.apply_boolean_mask(
                 self, as_column(boolean_mask)
@@ -1424,6 +1428,66 @@ def _apply_boolean_mask(self, boolean_mask):
         result._copy_type_metadata(self)
         return result
 
+    def _interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+        """
+        Interpolate data values between some points.
+
+        Parameters
+        ----------
+        method : str, default 'linear'
+            Interpolation technique to use. Currently,
+            only 'linear` is supported.
+            * 'linear': Ignore the index and treat the values as
+            equally spaced. This is the only method supported on MultiIndexes.
+            * 'index', 'values': linearly interpolate using the index as
+            an x-axis. Unsorted indices can lead to erroneous results.
+        axis : int, default 0
+            Axis to interpolate along. Currently,
+            only 'axis=0' is supprted.
+        inplace : bool, default False
+            Update the data in place if possible.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller, interpolated at
+            some or all ``NaN`` values
+
+        """
+
+        if method in {"pad", "ffill"} and limit_direction != "forward":
+            raise ValueError(
+                f"`limit_direction` must be 'forward' for method `{method}`"
+            )
+        if method in {"backfill", "bfill"} and limit_direction != "backward":
+            raise ValueError(
+                f"`limit_direction` must be 'backward' for method `{method}`"
+            )
+
+        columns = ColumnAccessor()
+
+        interpolator = cudf.core.algorithms.get_column_interpolator(method)
+        for colname, col in self._data.items():
+            if col.nullable:
+                col = col.fillna(np.nan)
+
+            # Interpolation methods may or may not need the index
+            to_interp = Frame(data={colname: col}, index=self.index)
+            result = interpolator(to_interp)
+            columns[colname] = result
+
+        return self.__class__(columns, index=self.index.copy())
+
     def _quantiles(
         self,
         q,

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -5418,6 +5418,29 @@ def hash_encode(self, stop, use_name=False):
         mod_vals = hashed_values % stop
         return Series(mod_vals._column, index=self.index, name=self.name)
 
+    @copy_docstring(Frame._interpolate)
+    def interpolate(
+        self,
+        method="linear",
+        axis=0,
+        limit=None,
+        inplace=False,
+        limit_direction=None,
+        limit_area=None,
+        downcast=None,
+        **kwargs,
+    ):
+        return super()._interpolate(
+            method=method,
+            axis=axis,
+            limit=limit,
+            inplace=inplace,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            downcast=downcast,
+            **kwargs,
+        )
+
     def quantile(
         self, q=0.5, interpolation="linear", exact=True, quant_index=True
     ):

@@ -0,0 +1,103 @@
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq, assert_exceptions_equal
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # basics
+        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
+        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
+        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_dataframe(data, method, axis):
+    # doesn't seem to work with NAs just yet
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    expect = pdf.interpolate(method=method, axis=axis)
+    got = gdf.interpolate(method=method, axis=axis)
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0],
+        [1.0, None, 3.0],
+        [None, 2.0, None, 4.0],
+        [1.0, None, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [None, None, None, None],
+        [0.1, 0.2, 0.3],
+    ],
+)
+@pytest.mark.parametrize("method", ["linear"])
+@pytest.mark.parametrize("axis", [0])
+def test_interpolate_series(data, method, axis):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method, axis=axis)
+    got = gsr.interpolate(method=method, axis=axis)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0],
+        [None, 2.0, 3.0, 4.0],
+        [1.0, 2.0, 3.0, None],
+        [None, None, 3.0, 4.0],
+        [1.0, 2.0, None, None],
+        [1.0, None, 3.0, None],
+        [None, 2.0, None, 4.0],
+        [None, None, None, None],
+    ],
+)
+@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
+@pytest.mark.parametrize("method", ["index", "values"])
+def test_interpolate_series_values_or_index(data, index, method):
+    gsr = cudf.Series(data, index=index)
+    psr = gsr.to_pandas()
+
+    expect = psr.interpolate(method=method)
+    got = gsr.interpolate(method=method)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data,kwargs",
+    [
+        (
+            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
+            {"axis": 0, "method": "linear"},
+        ),
+        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
+        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
+        (
+            {"A": [1, 2, 3]},
+            {"method": "backfill", "limit_direction": "forward"},
+        ),
+    ],
+)
+def test_interpolate_dataframe_error_cases(data, kwargs):
+    gsr = cudf.DataFrame(data)
+    psr = gsr.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=psr.interpolate,
+        rfunc=gsr.interpolate,
+        lfunc_args_and_kwargs=([], kwargs),
+        rfunc_args_and_kwargs=([], kwargs),
+    )