Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear Interpolation of nans via cupy #8767

Merged
merged 27 commits into from
Aug 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6b97e6e
very basic stuff
brandon-b-miller Jul 13, 2021
676388b
forgot test
brandon-b-miller Jul 13, 2021
d625c30
move things to frame
brandon-b-miller Jul 13, 2021
42b7311
Merge branch 'branch-21.08' into fea-linear-interp
brandon-b-miller Jul 19, 2021
c89d938
updates
brandon-b-miller Jul 19, 2021
5a4e720
sig and docstring updates
brandon-b-miller Jul 19, 2021
c17cd4f
updates
brandon-b-miller Jul 20, 2021
c16f2b3
progress
brandon-b-miller Jul 21, 2021
fe56bb1
refactoring
brandon-b-miller Jul 22, 2021
a681616
test index and values methods
brandon-b-miller Jul 22, 2021
98608a9
forgot the index
brandon-b-miller Jul 22, 2021
143c798
style
brandon-b-miller Jul 23, 2021
81ffee1
remove unnecessary older changes
brandon-b-miller Jul 23, 2021
f859d0e
directly add and test unsorted index case
brandon-b-miller Jul 28, 2021
71272a9
....but dont do it for RangeIndex based data
brandon-b-miller Jul 28, 2021
52e431a
Apply suggestions from code review
brandon-b-miller Jul 28, 2021
4fc0978
Merge branch 'fea-linear-interp' of github.com:brandon-b-miller/cudf …
brandon-b-miller Jul 28, 2021
088618e
fix minor bugs
brandon-b-miller Jul 28, 2021
4785a56
address reviews
brandon-b-miller Jul 28, 2021
ed6cb81
more reviews
brandon-b-miller Jul 28, 2021
b85edc1
just expose interpolate directly
brandon-b-miller Jul 28, 2021
82c4f1e
style
brandon-b-miller Jul 28, 2021
bb31ab0
Merge branch 'branch-21.10' into fea-linear-interp
brandon-b-miller Aug 2, 2021
b486c8b
address last review comment
brandon-b-miller Aug 2, 2021
0f29b34
merge
brandon-b-miller Aug 5, 2021
296eddc
address review
brandon-b-miller Aug 6, 2021
94cc6da
Update python/cudf/cudf/core/frame.py
brandon-b-miller Aug 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from warnings import warn

import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.frame import Frame
from cudf.core.index import RangeIndex
from cudf.core.series import Index, Series


Expand Down Expand Up @@ -59,3 +63,55 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
values.name = name

return labels, cats.values if return_cupy_array else Index(cats)


def _linear_interpolation(column, index=None):
    """
    Linearly interpolate a float column, treating values as evenly spaced.

    The x-axis is taken to be the row position, so the data
    [1.0, NaN, 3.0] is interpolated as if the NaN sat half way between
    its neighbours, yielding [1.0, 2.0, 3.0].

    The ``index`` argument is accepted so that every interpolator can be
    called with the same signature, but its value is not used here: a
    positional ``RangeIndex`` covering ``len(column)`` rows is always
    substituted before delegating to ``_index_or_values_interpolation``.
    """
    positional_index = RangeIndex(start=0, stop=len(column), step=1)
    return _index_or_values_interpolation(column, index=positional_index)


def _index_or_values_interpolation(column, index=None):
    """
    Linearly interpolate a float column using ``index`` as the x-axis.

    The index values give the spacing between points. For example the
    data and index [1.0, NaN, 4.0], [1, 3, 4] would result in
    [1.0, 3.0, 4.0].

    If the column holds no NaNs, or nothing but NaNs, it is returned
    unchanged. Otherwise the result of ``cp.interp`` is returned, with
    every position before the first non-NaN value set back to NaN.
    """
    # True wherever the input is NaN
    nan_mask = cp.isnan(column)

    # Nothing to do when there are no gaps, and nothing to anchor on
    # when every value is a gap
    nan_count = nan_mask.sum()
    if nan_count == 0 or nan_count == len(column):
        return column

    # Pair the column with its index so both can be filtered together
    frame = Frame(data={None: column}, index=index)
    valid_rows = frame._apply_boolean_mask(as_column(~nan_mask))

    xs = valid_rows._index._column.values
    ys = valid_rows._data.columns[0].values

    interpolated = cp.interp(frame._index.values, xs, ys)

    # argmax over the "is valid" mask gives the position of the first
    # non-NaN value; everything before it is a leading gap and is reset
    # to NaN rather than keeping whatever cp.interp put there
    first_valid_pos = (nan_mask == 0).argmax().item()
    interpolated[:first_valid_pos] = np.nan
    return interpolated


def get_column_interpolator(method):
    """
    Return the column-level interpolation function for ``method``.

    ``"linear"`` maps to ``_linear_interpolation``; ``"index"`` and
    ``"values"`` both map to ``_index_or_values_interpolation``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.
    """
    interpolators = {
        "linear": _linear_interpolation,
        "index": _index_or_values_interpolation,
        "values": _index_or_values_interpolation,
    }
    if method not in interpolators:
        raise ValueError(f"Interpolation method `{method}` not found")
    return interpolators[method]
30 changes: 30 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5893,6 +5893,36 @@ def _from_columns(cls, cols, index=None, columns=None):

return cls(data=data, index=index,)

def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    """
    Interpolate the frame after rejecting all-object-dtype input.

    Every argument is forwarded unchanged to the parent class's
    ``interpolate``; this override only adds the dtype check.

    Raises
    ------
    TypeError
        If every entry of ``self.dtypes`` equals the object dtype.
    """
    object_dtype = np.dtype("object")
    if all(dtype == object_dtype for dtype in self.dtypes):
        raise TypeError(
            "Cannot interpolate with all object-dtype "
            "columns in the DataFrame. Try setting at "
            "least one column to a numeric dtype."
        )

    return super().interpolate(
        method=method,
        axis=axis,
        limit=limit,
        inplace=inplace,
        limit_direction=limit_direction,
        limit_area=limit_area,
        downcast=downcast,
        **kwargs,
    )

def quantile(
self,
q=0.5,
Expand Down
69 changes: 69 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,75 @@ def _apply_boolean_mask(self, boolean_mask):
result._copy_type_metadata(self)
return result

def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    """
    Interpolate data values between some points.

    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use. Supported methods:

        * 'linear': Ignore the index and treat the values as
          equally spaced. This is the only method supported on
          MultiIndexes.
        * 'index', 'values': linearly interpolate using the index as
          an x-axis. Unsorted indices can lead to erroneous results.
    axis : int, default 0
        Axis to interpolate along. Currently,
        only 'axis=0' is supported.
    limit_direction : str, optional
        Only validated here: it must be 'forward' for the 'pad' and
        'ffill' methods and 'backward' for 'backfill' and 'bfill'.
    limit, inplace, limit_area, downcast, **kwargs
        Accepted for signature compatibility; this implementation does
        not read them. In particular a new object is always returned,
        whatever the value of ``inplace``.

    Returns
    -------
    Series or DataFrame
        Returns the same object type as the caller, interpolated at
        some or all ``NaN`` values

    Raises
    ------
    ValueError
        If ``limit_direction`` is inconsistent with a fill-style
        ``method``, or if ``method`` has no registered interpolator.
    """

    if method in {"pad", "ffill"} and limit_direction != "forward":
        raise ValueError(
            f"`limit_direction` must be 'forward' for method `{method}`"
        )
    if method in {"backfill", "bfill"} and limit_direction != "backward":
        raise ValueError(
            f"`limit_direction` must be 'backward' for method `{method}`"
        )

    # Resolve the interpolator up front so an unknown method fails
    # before any sorting work is done.
    interpolator = cudf.core.algorithms.get_column_interpolator(method)

    data = self
    perm_sort = None

    # Only the index-aware methods need rows ordered by index. 'linear'
    # is positional, so reordering the rows would change which values
    # count as neighbours and give wrong results for an unsorted index.
    if method in {"index", "values"} and not isinstance(
        self._index, cudf.RangeIndex
    ):
        perm_sort = self._index.argsort()
        data = self._gather(perm_sort)

    columns = {}
    for colname, col in data._data.items():
        if col.nullable:
            # Treat nulls as NaN so the interpolators see them as gaps
            col = col.astype("float64").fillna(np.nan)

        # Interpolation methods may or may not need the index
        columns[colname] = interpolator(col, index=data._index)

    result = self._from_data(columns, index=data._index)

    if perm_sort is None:
        return result

    # argsort of the sorting permutation is its inverse, which puts the
    # rows back in the caller's original order
    return result._gather(perm_sort.argsort())

def _quantiles(
self,
q,
Expand Down
119 changes: 119 additions & 0 deletions python/cudf/cudf/tests/test_interpolate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pytest

import cudf
from cudf.testing._utils import assert_eq, assert_exceptions_equal


@pytest.mark.parametrize(
    "data",
    [
        # basics
        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_dataframe(data, method, axis):
    """Check DataFrame.interpolate against pandas on the same data."""
    # Pandas interpolate methods do not seem to work with nullable
    # dtypes yet, so NAs are treated as NaNs for this comparison.
    # https://github.com/pandas-dev/pandas/issues/40252
    gdf = cudf.DataFrame(data)

    expect = gdf.to_pandas().interpolate(method=method, axis=axis)
    got = gdf.interpolate(method=method, axis=axis)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0],
        [1.0, None, 3.0],
        [None, 2.0, None, 4.0],
        [1.0, None, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [None, None, None, None],
        [0.1, 0.2, 0.3],
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_series(data, method, axis):
    """Check Series.interpolate against pandas, including leading,
    trailing, interior and all-null inputs."""
    gsr = cudf.Series(data)

    expect = gsr.to_pandas().interpolate(method=method, axis=axis)
    got = gsr.interpolate(method=method, axis=axis)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]
)
def test_interpolate_series_unsorted_index(data, index):
    """Check method="values" against pandas when the index is not
    sorted."""
    gsr = cudf.Series(data, index=index)

    expect = gsr.to_pandas().interpolate(method="values")
    got = gsr.interpolate(method="values")

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0, 4.0],
        [None, 2.0, 3.0, 4.0],
        [1.0, 2.0, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [1.0, None, 3.0, None],
        [None, 2.0, None, 4.0],
        [None, None, None, None],
    ],
)
@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
@pytest.mark.parametrize("method", ["index", "values"])
def test_interpolate_series_values_or_index(data, index, method):
    """Check the index-aware methods against pandas for unit-spaced,
    evenly spaced and unevenly spaced indices."""
    gsr = cudf.Series(data, index=index)

    expect = gsr.to_pandas().interpolate(method=method)
    got = gsr.interpolate(method=method)

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "data,kwargs",
    [
        (
            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
            {"axis": 0, "method": "linear"},
        ),
        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
        (
            {"A": [1, 2, 3]},
            {"method": "backfill", "limit_direction": "forward"},
        ),
    ],
)
def test_interpolate_dataframe_error_cases(data, kwargs):
    """Check that cudf and pandas raise matching exceptions for
    all-object frames and for fill methods paired with the wrong
    ``limit_direction``."""
    gdf = cudf.DataFrame(data)
    pdf = gdf.to_pandas()

    call_args = ([], kwargs)
    assert_exceptions_equal(
        lfunc=pdf.interpolate,
        rfunc=gdf.interpolate,
        lfunc_args_and_kwargs=call_args,
        rfunc_args_and_kwargs=call_args,
    )