Skip to content

Commit

Permalink
Linear Interpolation of nans via cupy (#8767)
Browse files Browse the repository at this point in the history
Adds Series and DataFrame level functions for linear interpolation of missing values, built around CuPy's `interp` method. 

Pandas `interpolate` API supports somewhat varied functionality for filling `NaN`s. It currently does not work for actual `<NA>` values - see the pandas issue [here](pandas-dev/pandas#40252). That said, one might expect both kinds of missing data to be treated equally for the purposes of interpolation, and this PR does that.

While `cp.interp` is great for getting us off the ground, it only supports linear interpolation and its results aren't exactly what pandas produces. In particular, pandas will not fill `NaN`s at the start of the series, because the default value of `limit_direction` is `forward`. The default `limit` is `None`, which from my experimentation means 'unlimited', so the `NaN`s at the end WILL get filled. This means we need to actually figure out where the leading run of `NaN`s ends (i.e. where the first valid value is) and mask out that part of the series with `NaN`s.

Closes #8685.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

URL: #8767
  • Loading branch information
brandon-b-miller authored Aug 10, 2021
1 parent e8b05de commit b1c2dd4
Show file tree
Hide file tree
Showing 4 changed files with 274 additions and 0 deletions.
56 changes: 56 additions & 0 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from warnings import warn

import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.frame import Frame
from cudf.core.index import RangeIndex
from cudf.core.series import Index, Series


Expand Down Expand Up @@ -59,3 +63,55 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
values.name = name

return labels, cats.values if return_cupy_array else Index(cats)


def _linear_interpolation(column, index=None):
    """
    Linearly interpolate a float column, treating its values as evenly
    spaced along the x-axis.

    The ``index`` argument is accepted for signature compatibility with
    the other interpolators but is ignored: a fresh ``RangeIndex``
    running from 0 to ``len(column)`` with step 1 always supplies the
    x-coordinates. For example the data [1.0, NaN, 3.0] is interpolated
    assuming the NaN is half way between the two valid values, yielding
    [1.0, 2.0, 3.0].
    """
    evenly_spaced = RangeIndex(start=0, stop=len(column), step=1)
    return _index_or_values_interpolation(column, index=evenly_spaced)


def _index_or_values_interpolation(column, index=None):
    """
    Linearly interpolate a float column, using ``index`` as the
    x-coordinates of the values.

    For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
    would result in [1.0, 3.0, 4.0].

    A column with no NaNs, or with nothing but NaNs, is returned
    unchanged. Otherwise the result of ``cp.interp`` is returned, with
    every position ahead of the first valid value reset to NaN.
    """
    nan_mask = cp.isnan(column)

    # Trivial cases: nothing to fill, or no valid value to fill from.
    nan_count = nan_mask.sum()
    if nan_count == 0 or nan_count == len(column):
        return column

    valid = ~nan_mask

    # Pair the values with their x-coordinates, then keep only the rows
    # that hold a valid value: those are the known points.
    frame = Frame(data={None: column}, index=index)
    known_points = frame._apply_boolean_mask(as_column(valid))

    xp = known_points._index._column.values
    fp = known_points._data.columns[0].values

    interpolated = cp.interp(frame._index.values, xp, fp)

    # ``argmax`` over the validity flags is the position of the first
    # valid value; everything ahead of it was a leading NaN and must
    # stay NaN in the output.
    first_valid_idx = valid.argmax().item()
    interpolated[:first_valid_idx] = np.nan
    return interpolated


def get_column_interpolator(method):
    """
    Look up the column-level interpolation function for ``method``.

    Parameters
    ----------
    method : str
        One of 'linear', 'index' or 'values'.

    Returns
    -------
    callable
        ``_linear_interpolation`` for 'linear', otherwise
        ``_index_or_values_interpolation``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    dispatch = {
        "linear": _linear_interpolation,
        "index": _index_or_values_interpolation,
        "values": _index_or_values_interpolation,
    }
    if method not in dispatch:
        raise ValueError(f"Interpolation method `{method}` not found")
    return dispatch[method]
30 changes: 30 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5893,6 +5893,36 @@ def _from_columns(cls, cols, index=None, columns=None):

return cls(data=data, index=index,)

def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    """
    Validate the column dtypes, then delegate to the parent class's
    ``interpolate`` with every argument forwarded unchanged.

    Raises
    ------
    TypeError
        If every entry of ``self.dtypes`` is the object dtype.
    """
    object_dtype = np.dtype("object")
    only_object_columns = all(dt == object_dtype for dt in self.dtypes)
    if only_object_columns:
        raise TypeError(
            "Cannot interpolate with all object-dtype "
            "columns in the DataFrame. Try setting at "
            "least one column to a numeric dtype."
        )

    forwarded = {
        "method": method,
        "axis": axis,
        "limit": limit,
        "inplace": inplace,
        "limit_direction": limit_direction,
        "limit_area": limit_area,
        "downcast": downcast,
    }
    return super().interpolate(**forwarded, **kwargs)

def quantile(
self,
q=0.5,
Expand Down
69 changes: 69 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,75 @@ def _apply_boolean_mask(self, boolean_mask):
result._copy_type_metadata(self)
return result

def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    """
    Interpolate data values between some points.

    Each column is interpolated independently by the function that
    ``cudf.core.algorithms.get_column_interpolator`` returns for
    ``method``. When the index is not a ``RangeIndex`` the rows are
    first reordered by ``argsort`` of the index and put back in their
    original order afterwards.

    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use.

        * 'linear': Ignore the index and treat the values as
          equally spaced.
        * 'index', 'values': linearly interpolate using the index as
          an x-axis.
    axis : int, default 0
        Axis to interpolate along. Currently,
        only 'axis=0' is supported.
    limit_direction : str, optional
        Only validated here: must be 'forward' when ``method`` is
        'pad' or 'ffill', and 'backward' when ``method`` is 'backfill'
        or 'bfill'.
    limit, inplace, limit_area, downcast, **kwargs
        Accepted, but not read by the body of this method.

    Returns
    -------
    Series or DataFrame
        Returns the same object type as the caller, interpolated at
        some or all ``NaN`` values

    Raises
    ------
    ValueError
        If ``limit_direction`` does not match a fill-style ``method``
        as described above.
    """
    forward_only = {"pad", "ffill"}
    backward_only = {"backfill", "bfill"}

    if method in forward_only and limit_direction != "forward":
        raise ValueError(
            f"`limit_direction` must be 'forward' for method `{method}`"
        )
    if method in backward_only and limit_direction != "backward":
        raise ValueError(
            f"`limit_direction` must be 'backward' for method `{method}`"
        )

    frame = self
    if not isinstance(frame._index, cudf.RangeIndex):
        # Work on the rows in index order; ``order`` is kept so the
        # original row order can be restored at the end.
        order = frame._index.argsort()
        frame = frame._gather(order)

    interpolate_column = cudf.core.algorithms.get_column_interpolator(method)

    filled = {}
    for name, col in frame._data.items():
        if col.nullable:
            # Turn nulls into NaN in a float64 column before handing the
            # column to the interpolator.
            col = col.astype("float64").fillna(np.nan)

        # Interpolation methods may or may not need the index
        filled[name] = interpolate_column(col, index=frame._index)

    result = self._from_data(filled, index=frame._index)

    if isinstance(frame._index, cudf.RangeIndex):
        return result
    # argsort of the sorting permutation is its inverse, which undoes
    # the reordering applied above.
    return result._gather(order.argsort())

def _quantiles(
self,
q,
Expand Down
119 changes: 119 additions & 0 deletions python/cudf/cudf/tests/test_interpolate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pytest

import cudf
from cudf.testing._utils import assert_eq, assert_exceptions_equal


@pytest.mark.parametrize(
    "data",
    [
        # basics
        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_dataframe(data, method, axis):
    """Compare ``cudf.DataFrame.interpolate`` against the pandas result."""
    # Pandas interpolate methods do not seem to work
    # with nullable dtypes yet, so this method treats
    # NAs as NaNs
    # https://github.com/pandas-dev/pandas/issues/40252
    gpu_frame = cudf.DataFrame(data)
    host_frame = gpu_frame.to_pandas()
    options = {"method": method, "axis": axis}

    expect = host_frame.interpolate(**options)
    got = gpu_frame.interpolate(**options)
    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0],
        [1.0, None, 3.0],
        [None, 2.0, None, 4.0],
        [1.0, None, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [None, None, None, None],
        [0.1, 0.2, 0.3],
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_series(data, method, axis):
    """Compare ``cudf.Series.interpolate`` against the pandas result."""
    gpu_series = cudf.Series(data)
    host_series = gpu_series.to_pandas()
    options = {"method": method, "axis": axis}

    expect = host_series.interpolate(**options)
    got = gpu_series.interpolate(**options)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]
)
def test_interpolate_series_unsorted_index(data, index):
    """Check ``method="values"`` against pandas on an unsorted index."""
    gpu_series = cudf.Series(data, index=index)
    host_series = gpu_series.to_pandas()
    technique = "values"

    expect = host_series.interpolate(method=technique)
    got = gpu_series.interpolate(method=technique)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0, 4.0],
        [None, 2.0, 3.0, 4.0],
        [1.0, 2.0, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [1.0, None, 3.0, None],
        [None, 2.0, None, 4.0],
        [None, None, None, None],
    ],
)
@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
@pytest.mark.parametrize("method", ["index", "values"])
def test_interpolate_series_values_or_index(data, index, method):
    """Check the index-aware methods against pandas on several indexes."""
    gpu_series = cudf.Series(data, index=index)
    host_series = gpu_series.to_pandas()

    expect = host_series.interpolate(method=method)
    got = gpu_series.interpolate(method=method)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data,kwargs",
    [
        (
            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
            {"axis": 0, "method": "linear"},
        ),
        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
        (
            {"A": [1, 2, 3]},
            {"method": "backfill", "limit_direction": "forward"},
        ),
    ],
)
def test_interpolate_dataframe_error_cases(data, kwargs):
    """Check that pandas and cudf DataFrames fail the same way for bad input."""
    gpu_frame = cudf.DataFrame(data)
    host_frame = gpu_frame.to_pandas()

    # pandas is the left-hand (reference) callable, cudf the right-hand one;
    # both are called with no positional arguments and the same kwargs.
    assert_exceptions_equal(
        lfunc=host_frame.interpolate,
        rfunc=gpu_frame.interpolate,
        lfunc_args_and_kwargs=([], kwargs),
        rfunc_args_and_kwargs=([], kwargs),
    )

0 comments on commit b1c2dd4

Please sign in to comment.