Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear Interpolation of nans via cupy #8767

Merged
merged 27 commits into from
Aug 10, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6b97e6e
very basic stuff
brandon-b-miller Jul 13, 2021
676388b
forgot test
brandon-b-miller Jul 13, 2021
d625c30
move things to frame
brandon-b-miller Jul 13, 2021
42b7311
Merge branch 'branch-21.08' into fea-linear-interp
brandon-b-miller Jul 19, 2021
c89d938
updates
brandon-b-miller Jul 19, 2021
5a4e720
sig and docstring updates
brandon-b-miller Jul 19, 2021
c17cd4f
updates
brandon-b-miller Jul 20, 2021
c16f2b3
progress
brandon-b-miller Jul 21, 2021
fe56bb1
refactoring
brandon-b-miller Jul 22, 2021
a681616
test index and values methods
brandon-b-miller Jul 22, 2021
98608a9
forgot the index
brandon-b-miller Jul 22, 2021
143c798
style
brandon-b-miller Jul 23, 2021
81ffee1
remove unnecessary older changes
brandon-b-miller Jul 23, 2021
f859d0e
directly add and test unsorted index case
brandon-b-miller Jul 28, 2021
71272a9
....but dont do it for RangeIndex based data
brandon-b-miller Jul 28, 2021
52e431a
Apply suggestions from code review
brandon-b-miller Jul 28, 2021
4fc0978
Merge branch 'fea-linear-interp' of github.com:brandon-b-miller/cudf …
brandon-b-miller Jul 28, 2021
088618e
fix minor bugs
brandon-b-miller Jul 28, 2021
4785a56
address reviews
brandon-b-miller Jul 28, 2021
ed6cb81
more reviews
brandon-b-miller Jul 28, 2021
b85edc1
just expose interpolate directly
brandon-b-miller Jul 28, 2021
82c4f1e
style
brandon-b-miller Jul 28, 2021
bb31ab0
Merge branch 'branch-21.10' into fea-linear-interp
brandon-b-miller Aug 2, 2021
b486c8b
address last review comment
brandon-b-miller Aug 2, 2021
0f29b34
merge
brandon-b-miller Aug 5, 2021
296eddc
address review
brandon-b-miller Aug 6, 2021
94cc6da
Update python/cudf/cudf/core/frame.py
brandon-b-miller Aug 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
from warnings import warn

import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.index import RangeIndex
from cudf.core.series import Index, Series


Expand Down Expand Up @@ -59,3 +62,59 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
values.name = name

return labels, cats.values if return_cupy_array else Index(cats)


def linear_interpolation(to_interp):
    """
    Linearly interpolate a single float column, ignoring its index.

    The values are treated as evenly spaced along the x-axis, so the
    data ``[1.0, NaN, 3.0]`` places the NaN half way between the two
    valid values and yields ``[1.0, 2.0, 3.0]``.

    Note that the index of ``to_interp`` is overwritten in place with a
    positional ``RangeIndex`` before the work is delegated to
    ``index_or_values_interpolation``.
    """
    # Using the positions 0..n-1 as the x-axis reduces the "linear"
    # method to the index-based one.
    n_rows = len(to_interp)
    to_interp._index = RangeIndex(start=0, stop=n_rows, step=1)
    interpolated = index_or_values_interpolation(to_interp)
    return interpolated


def index_or_values_interpolation(to_interp):
    """
    Linearly interpolate a single float column against its index.

    The index supplies the x coordinates and the column the y
    coordinates. For example the data ``[1.0, NaN, 4.0]`` with index
    ``[1, 3, 4]`` results in ``[1.0, 3.0, 4.0]``.

    The first column of ``to_interp`` is replaced in place by its
    float64 version with nulls converted to NaN. If every value is NaN
    that column is returned as is; otherwise the array produced by
    ``cp.interp`` is returned, with the entries ahead of the first valid
    value reset to NaN.
    """
    name = list(to_interp._data.keys())[0]
    as_float = to_interp._data[name].astype("float64").fillna(np.nan)
    to_interp._data[name] = as_float

    # Read the column back from the frame rather than reusing the local
    values = to_interp._data[name]

    nan_mask = cp.isnan(values)

    # Trivial case: there are no known points to interpolate between
    if nan_mask.all():
        return values

    # Keep only the rows that hold a valid value; these are the known
    # (x, y) points handed to the interpolation
    valid_mask = as_column(~nan_mask)
    known = to_interp._apply_boolean_mask(valid_mask)
    xp = cp.asarray(known._index._column)
    fp = cp.asarray(known._data.columns[0])

    all_x = cp.asarray(to_interp._index)
    interpolated = cp.interp(all_x, xp, fp)

    # Locate the first *valid* entry (the mask is already inverted) and
    # blank out everything ahead of it, since interp fills those
    # positions with the first known value.
    # NOTE(review): presumably done to match pandas, which the tests
    # compare against - confirm.
    first_valid = as_column(valid_mask).find_first_value(1)
    interpolated[:first_valid] = np.nan
    return interpolated


def get_column_interpolator(method):
    """
    Return the column-level interpolation function for ``method``.

    ``"linear"`` maps to ``linear_interpolation``; ``"index"`` and
    ``"values"`` both map to ``index_or_values_interpolation``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.
    """
    if method == "linear":
        return linear_interpolation
    if method not in {"index", "values"}:
        raise ValueError(f"Interpolation method `{method}` not found")
    return index_or_values_interpolation
35 changes: 35 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5891,6 +5891,41 @@ def _from_columns(cls, cols, index=None, columns=None):

return cls(data=data, index=index,)

@copy_docstring(Frame._interpolate)
def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    # The docstring is supplied by ``copy_docstring`` from
    # ``Frame._interpolate``; all arguments are forwarded to it unchanged.

    # The 'index' and 'values' methods use the index as the x-axis.
    # pandas accepts an unsorted index for these methods without
    # complaint, e.g.
    #   pd.Series([2, None, 4, None, 2], index=[1, 2, 3, 2, 1])
    #     .interpolate('values')  ->  [2.0, 3.0, 4.0, 3.0, 2.0]
    # so warn instead of raising.
    if (
        method in {"index", "values"}
        and not self.index.is_monotonic_increasing
    ):
        warnings.warn(
            f"Interpolating with method='{method}' on an index that is "
            "not monotonically increasing may produce erroneous "
            "results; consider sorting the index first."
        )

    # There is nothing numeric to interpolate when every column is
    # object-dtype.
    if all(dt == np.dtype("object") for dt in self.dtypes):
        raise TypeError(
            "Cannot interpolate with all object-dtype "
            "columns in the DataFrame. Try setting at "
            "least one column to a numeric dtype."
        )
    return super()._interpolate(
        method=method,
        axis=axis,
        limit=limit,
        inplace=inplace,
        limit_direction=limit_direction,
        limit_area=limit_area,
        downcast=downcast,
        **kwargs,
    )

def quantile(
self,
q=0.5,
Expand Down
68 changes: 66 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,12 @@ def __init_subclass__(cls):

@classmethod
def _from_table(cls, table: Frame):
    """
    Build an instance of ``cls`` from the data and index of ``table``.

    When ``table`` has an index it is first passed through
    ``cudf.Index._from_table``; a missing index (``None``) is forwarded
    unchanged.
    """
    # The earlier unconditional ``return cls(table._data,
    # index=table._index)`` sat ahead of this logic and made it
    # unreachable; only the index-converting form is kept.
    index = table._index
    if index is not None:
        index = cudf.Index._from_table(index)
    return cls(table._data, index=index)

def _mimic_inplace(
self: T, result: Frame, inplace: bool = False
Expand Down Expand Up @@ -1415,7 +1420,6 @@ def _apply_boolean_mask(self, boolean_mask):
rows corresponding to `False` is dropped
"""
boolean_mask = as_column(boolean_mask)

result = self.__class__._from_table(
libcudf.stream_compaction.apply_boolean_mask(
self, as_column(boolean_mask)
Expand All @@ -1424,6 +1428,66 @@ def _apply_boolean_mask(self, boolean_mask):
result._copy_type_metadata(self)
return result

def _interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    """
    Interpolate data values between some points.

    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use. Currently, only 'linear',
        'index' and 'values' are supported.

        * 'linear': Ignore the index and treat the values as
          equally spaced. This is the only method supported on
          MultiIndexes.
        * 'index', 'values': linearly interpolate using the index as
          an x-axis. Unsorted indices can lead to erroneous results.
    axis : int, default 0
        Axis to interpolate along. Currently,
        only 'axis=0' is supported.
    limit : optional
        Accepted for signature compatibility; not used by this
        implementation.
    inplace : bool, default False
        Update the data in place if possible.
    limit_direction : str, optional
        Only validated against the 'pad'/'ffill' and
        'backfill'/'bfill' methods; not otherwise used.
    limit_area, downcast : optional
        Accepted for signature compatibility; not used by this
        implementation.

    Returns
    -------
    Series or DataFrame
        Returns the same object type as the caller, interpolated at
        some or all ``NaN`` values

    Raises
    ------
    ValueError
        If ``limit_direction`` is inconsistent with a fill-style
        ``method``, or if ``method`` is not a known interpolation
        method.
    """

    if method in {"pad", "ffill"} and limit_direction != "forward":
        raise ValueError(
            f"`limit_direction` must be 'forward' for method `{method}`"
        )
    if method in {"backfill", "bfill"} and limit_direction != "backward":
        raise ValueError(
            f"`limit_direction` must be 'backward' for method `{method}`"
        )

    columns = ColumnAccessor()

    interpolator = cudf.core.algorithms.get_column_interpolator(method)
    for colname, col in self._data.items():
        # The interpolators work on NaNs, so nulls are converted first
        if col.nullable:
            col = col.fillna(np.nan)

        # Interpolation methods may or may not need the index, so each
        # column is handed over together with it as a one-column frame
        to_interp = Frame(data={colname: col}, index=self.index)
        columns[colname] = interpolator(to_interp)

    result = self.__class__(columns, index=self.index.copy())

    # Previously ``inplace`` was accepted but silently ignored.
    # NOTE(review): relies on ``_mimic_inplace`` handing back ``result``
    # untouched when ``inplace`` is False - confirm.
    return self._mimic_inplace(result, inplace=inplace)

def _quantiles(
self,
q,
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5418,6 +5418,29 @@ def hash_encode(self, stop, use_name=False):
mod_vals = hashed_values % stop
return Series(mod_vals._column, index=self.index, name=self.name)

@copy_docstring(Frame._interpolate)
def interpolate(
    self,
    method="linear",
    axis=0,
    limit=None,
    inplace=False,
    limit_direction=None,
    limit_area=None,
    downcast=None,
    **kwargs,
):
    # Pure delegation: the docstring comes from ``copy_docstring`` and
    # every argument is handed to ``_interpolate`` under its own name.
    forwarded = {
        "method": method,
        "axis": axis,
        "limit": limit,
        "inplace": inplace,
        "limit_direction": limit_direction,
        "limit_area": limit_area,
        "downcast": downcast,
    }
    return super()._interpolate(**forwarded, **kwargs)

vyasr marked this conversation as resolved.
Show resolved Hide resolved
def quantile(
self, q=0.5, interpolation="linear", exact=True, quant_index=True
):
Expand Down
103 changes: 103 additions & 0 deletions python/cudf/cudf/tests/test_interpolate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import pytest

import cudf
from cudf.testing._utils import assert_eq, assert_exceptions_equal


@pytest.mark.parametrize(
    "data",
    [
        # basics
        {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]},
        {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]},
        {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]},
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_dataframe(data, method, axis):
    """Check DataFrame.interpolate against the pandas result."""
    # Nullable dtypes do not interpolate in pandas yet (there appear to
    # be some bugs there), whereas the cudf implementation treats nulls
    # and NaNs the same.
    gpu_frame = cudf.DataFrame(data)
    pandas_frame = gpu_frame.to_pandas()

    options = {"method": method, "axis": axis}
    expected = pandas_frame.interpolate(**options)
    actual = gpu_frame.interpolate(**options)
    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0],
        [1.0, None, 3.0],
        [None, 2.0, None, 4.0],
        [1.0, None, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [None, None, None, None],
        [0.1, 0.2, 0.3],
    ],
)
@pytest.mark.parametrize("method", ["linear"])
@pytest.mark.parametrize("axis", [0])
def test_interpolate_series(data, method, axis):
    """Check Series.interpolate against the pandas result."""
    gpu_series = cudf.Series(data)
    pandas_series = gpu_series.to_pandas()

    options = {"method": method, "axis": axis}
    expected = pandas_series.interpolate(**options)
    actual = gpu_series.interpolate(**options)

    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data",
    [
        [1.0, 2.0, 3.0, 4.0],
        [None, 2.0, 3.0, 4.0],
        [1.0, 2.0, 3.0, None],
        [None, None, 3.0, 4.0],
        [1.0, 2.0, None, None],
        [1.0, None, 3.0, None],
        [None, 2.0, None, 4.0],
        [None, None, None, None],
    ],
)
@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]])
@pytest.mark.parametrize("method", ["index", "values"])
def test_interpolate_series_values_or_index(data, index, method):
    """Check index-based interpolation of a Series against pandas."""
    gpu_series = cudf.Series(data, index=index)
    pandas_series = gpu_series.to_pandas()

    expected = pandas_series.interpolate(method=method)
    actual = gpu_series.interpolate(method=method)

    assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data,kwargs",
    [
        (
            {"A": ["a", "b", "c"], "B": ["d", "e", "f"]},
            {"axis": 0, "method": "linear"},
        ),
        ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}),
        ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}),
        (
            {"A": [1, 2, 3]},
            {"method": "backfill", "limit_direction": "forward"},
        ),
    ],
)
def test_interpolate_dataframe_error_cases(data, kwargs):
    """Check that invalid interpolate calls fail the same way as pandas."""
    gdf = cudf.DataFrame(data)
    pdf = gdf.to_pandas()

    # Both sides are called with no positional arguments and the same
    # keyword arguments.
    call_args = ([], kwargs)
    assert_exceptions_equal(
        lfunc=pdf.interpolate,
        rfunc=gdf.interpolate,
        lfunc_args_and_kwargs=call_args,
        rfunc_args_and_kwargs=call_args,
    )