From 6b97e6e7761876648038e02fe8c1f2f77fa0ed33 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 13 Jul 2021 08:39:36 -0700 Subject: [PATCH 01/23] very basic stuff --- python/cudf/cudf/core/dataframe.py | 25 ++++++++++++++++--------- python/cudf/cudf/core/series.py | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c02bf3d11a4..ba2fc029f9c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5818,6 +5818,13 @@ def _from_columns(cls, cols, index=None, columns=None): return cls(data=data, index=index,) + def interpolate( + self, + method='linear', + axis=0 + ): + return self._apply_support_method("interpolate", method=method, axis=axis) + def quantile( self, q=0.5, @@ -7063,12 +7070,12 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): **kwargs, ) - def _apply_support_method(self, method, axis=0, *args, **kwargs): + def _apply_support_method(self, _method, axis=0, *args, **kwargs): assert axis in (None, 0, 1) if axis in (None, 0): result = [ - getattr(self[col], method)(*args, **kwargs) + getattr(self[col], _method)(*args, **kwargs) for col in self._data.names ] @@ -7085,13 +7092,13 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): elif axis == 1: # for dask metadata compatibility skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( + if _method not in _cupy_nan_methods_map and skipna not in ( None, True, 1, ): raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" + f"Row-wise operation to calculate '{_method}'" f" currently do not support `skipna=False`." ) @@ -7123,7 +7130,7 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): ) prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna + _method, skipna ) for col in prepared._data.names: if prepared._data[col].nullable: @@ -7140,10 +7147,10 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): ) arr = cupy.asarray(prepared.as_gpu_matrix()) - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] + if skipna is not False and _method in _cupy_nan_methods_map: + _method = _cupy_nan_methods_map[_method] - result = getattr(cupy, method)(arr, axis=1, **kwargs) + result = getattr(cupy, _method)(arr, axis=1, **kwargs) if result.ndim == 1: type_coerced_methods = { @@ -7159,7 +7166,7 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): } result_dtype = ( common_dtype - if method in type_coerced_methods + if _method in type_coerced_methods or is_datetime_dtype(common_dtype) else None ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 77640db6a1d..50fd32cee5d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5403,6 +5403,21 @@ def hash_encode(self, stop, use_name=False): mod_vals = hashed_values % stop return Series(mod_vals._column, index=self.index, name=self.name) + def interpolate( + self, + method='linear' + ): + data = cupy.asarray(self._column.astype('float').fillna(np.nan)) + interp_points = cupy.asarray(self.index) + + known = self[~self.isnull()] + known_x = cupy.asarray(known.index) + known_y = cupy.asarray(known._column) + + result = cupy.interp(interp_points, known_x, known_y) + + return cudf.Series(result) + def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): From 676388b774786dcd647395cd636c11a47cd782f0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 13 Jul 2021 11:55:46 -0700 Subject: [PATCH 02/23] forgot test --- python/cudf/cudf/tests/test_interpolate.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 python/cudf/cudf/tests/test_interpolate.py diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py new file mode 100644 index 00000000000..bab680bbb3c --- /dev/null +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -0,0 +1,21 @@ +import pandas as pd +import cudf +import pytest +from cudf.testing._utils import assert_eq + +@pytest.mark.parametrize("data", [ + { + 'A': [1, None, 3] + } +]) +@pytest.mark.parametrize("method", ['linear']) +@pytest.mark.parametrize("axis", [0]) +def test_interpolate_nans(data, method,axis): + # doesn't seem to work with NAs just yet + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expect = pdf.interpolate(method=method, axis=axis) + got = gdf.interpolate(method=method, axis=axis) + + assert_eq(expect, got) From d625c306e97ac817dda6e96ba91d6a2c5b6485f2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 13 Jul 2021 14:54:53 -0700 Subject: [PATCH 03/23] move things to frame --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 19 +++++++++++++++++++ python/cudf/cudf/core/series.py | 11 +---------- python/cudf/cudf/tests/test_interpolate.py | 9 ++++++++- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ba2fc029f9c..c479570616c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5823,7 +5823,7 @@ def interpolate( method='linear', axis=0 ): - return self._apply_support_method("interpolate", method=method, axis=axis) + return super()._interpolate(method) def quantile( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3629358ee9f..8f102861218 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1424,6 +1424,25 @@ def _apply_boolean_mask(self, boolean_mask): result._copy_type_metadata(self) return result + def _interpolate(self, method='linear'): + + to_return = {} + for colname, col in self._data.items(): + if col.nullable: + data = cupy.asarray(col.astype('float').fillna(np.nan)) + not_null = col.isnull().unary_operator('not') + + known_x = cupy.asarray(self.index._column.apply_boolean_mask(not_null)) + known_y = cupy.asarray(col.apply_boolean_mask(not_null)).astype('float') + + result = cupy.interp(cupy.asarray(self.index), known_x, known_y, left=np.nan, right=np.nan) + else: + result = col + to_return[colname] = result + + + return self.__class__(to_return) + def _quantiles( self, q, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 50fd32cee5d..b091880ade3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5407,16 +5407,7 @@ def interpolate( self, method='linear' ): - data = cupy.asarray(self._column.astype('float').fillna(np.nan)) - interp_points = cupy.asarray(self.index) - - known = self[~self.isnull()] - known_x = cupy.asarray(known.index) - known_y = cupy.asarray(known._column) - - result = cupy.interp(interp_points, known_x, known_y) - - return cudf.Series(result) + return super()._interpolate(method) def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index bab680bbb3c..f5e2523f323 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -6,6 +6,13 @@ @pytest.mark.parametrize("data", [ { 'A': [1, None, 3] + }, + { + 'A': [None, 2, 3, None, 5] + }, + { + 'A': [1, None, 3], + 'B': [None, 2, 3] } ]) @pytest.mark.parametrize("method", ['linear']) @@ -17,5 +24,5 @@ def test_interpolate_nans(data, method,axis): expect = pdf.interpolate(method=method, axis=axis) got = gdf.interpolate(method=method, axis=axis) - + breakpoint() assert_eq(expect, got) From c89d93893acb19dd43b7311024ec280e48c4934a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Jul 2021 05:33:34 -0700 Subject: [PATCH 04/23] updates --- python/cudf/cudf/core/dataframe.py | 35 ++++++++++++++++++++++++++++++ python/cudf/cudf/core/frame.py | 33 +++++++++++++++++++--------- python/cudf/cudf/core/series.py | 1 + 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d53a392c0b7..6fc1293cf19 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5896,6 +5896,41 @@ def interpolate( method='linear', axis=0 ): + """ + Interpolate data values between some points. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. Currently, + only 'linear` is supported. + * 'linear': Ignore the index and treat the values as + equally spaced. This is the only method supported on MultiIndexes. + * 'index', 'values': linearly interpolate using the index as + an x-axis. Note that unsorted indices can lead to erroneous results. + axis : int, default 0 + Axis to interpolate along. Currently, + only 'axis=0' is supprted. + inplace : bool, default False + Update the data in place if possible. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values + + """ + + if method not in {'linear', 'index', 'values'}: + raise ValueError( + f"method {method} is not supported." + ) + if method in {'index', 'values'} and not self.index.is_monotonic_increasing: + warnings.warn( + "Unsorted Index..." + ) + return super()._interpolate(method) def quantile( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0d45269af04..8b439271867 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1425,23 +1425,36 @@ def _apply_boolean_mask(self, boolean_mask): return result def _interpolate(self, method='linear'): + columns = ColumnAccessor() + + if method == 'linear': + xax = as_column(cupy.arange(len(self))) + elif method in {'index', 'values'}: + xax = self.index - to_return = {} for colname, col in self._data.items(): if col.nullable: - data = cupy.asarray(col.astype('float').fillna(np.nan)) - not_null = col.isnull().unary_operator('not') - - known_x = cupy.asarray(self.index._column.apply_boolean_mask(not_null)) - known_y = cupy.asarray(col.apply_boolean_mask(not_null)).astype('float') - - result = cupy.interp(cupy.asarray(self.index), known_x, known_y, left=np.nan, right=np.nan) + not_null = col.notnull() + known_x = cupy.asarray( + xax.apply_boolean_mask(not_null) + ) + known_y = cupy.asarray( + col.apply_boolean_mask(not_null) + ).astype(np.dtype('float64')) + + result = cupy.interp( + cupy.asarray(xax), + known_x, + known_y, + left=np.nan, + right=np.nan) else: + # The trivial case result = col - to_return[colname] = result + columns[colname] = result - return self.__class__(to_return) + return self.__class__(columns) def _quantiles( self, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f123baf6897..e3a0024c886 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5418,6 +5418,7 @@ def hash_encode(self, stop, use_name=False): mod_vals = hashed_values % stop return Series(mod_vals._column, index=self.index, name=self.name) + @copy_docstring(DataFrame.interpolate) def interpolate( self, method='linear' From 5a4e720927067173a57de4a5f192215973176988 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Jul 2021 06:32:48 -0700 Subject: [PATCH 05/23] sig and docstring updates --- python/cudf/cudf/core/dataframe.py | 46 ++++++++++++------------------ python/cudf/cudf/core/frame.py | 25 ++++++++++++++++ python/cudf/cudf/core/series.py | 22 ++++++++++++-- 3 files changed, 62 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6fc1293cf19..05acc3f4201 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5891,36 +5891,18 @@ def _from_columns(cls, cols, index=None, columns=None): return cls(data=data, index=index,) + @copy_docstring(Frame._interpolate) def interpolate( self, method='linear', - axis=0 + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs ): - """ - Interpolate data values between some points. - - Parameters - ---------- - method : str, default 'linear' - Interpolation technique to use. Currently, - only 'linear` is supported. - * 'linear': Ignore the index and treat the values as - equally spaced. This is the only method supported on MultiIndexes. - * 'index', 'values': linearly interpolate using the index as - an x-axis. Note that unsorted indices can lead to erroneous results. - axis : int, default 0 - Axis to interpolate along. Currently, - only 'axis=0' is supprted. - inplace : bool, default False - Update the data in place if possible. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller, interpolated at - some or all ``NaN`` values - - """ if method not in {'linear', 'index', 'values'}: raise ValueError( @@ -5930,8 +5912,16 @@ def interpolate( warnings.warn( "Unsorted Index..." ) - - return super()._interpolate(method) + return super()._interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs + ) def quantile( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8b439271867..a8b84aead29 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1425,6 +1425,31 @@ def _apply_boolean_mask(self, boolean_mask): return result def _interpolate(self, method='linear'): + """ + Interpolate data values between some points. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. Currently, + only 'linear` is supported. + * 'linear': Ignore the index and treat the values as + equally spaced. This is the only method supported on MultiIndexes. + * 'index', 'values': linearly interpolate using the index as + an x-axis. Note that unsorted indices can lead to erroneous results. + axis : int, default 0 + Axis to interpolate along. Currently, + only 'axis=0' is supprted. + inplace : bool, default False + Update the data in place if possible. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values + + """ columns = ColumnAccessor() if method == 'linear': diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e3a0024c886..0f97d9607ff 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5418,12 +5418,28 @@ def hash_encode(self, stop, use_name=False): mod_vals = hashed_values % stop return Series(mod_vals._column, index=self.index, name=self.name) - @copy_docstring(DataFrame.interpolate) + @copy_docstring(Frame._interpolate) def interpolate( self, - method='linear' + method='linear', + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs ): - return super()._interpolate(method) + return super()._interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs + ) def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True From c17cd4fac3c6f6a767561f4739766bd9d0514a80 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 20 Jul 2021 07:42:17 -0700 Subject: [PATCH 06/23] updates --- python/cudf/cudf/core/dataframe.py | 5 ++ python/cudf/cudf/core/frame.py | 12 ++++- python/cudf/cudf/tests/test_interpolate.py | 60 +++++++++++++++++++--- 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 05acc3f4201..7717fa167c5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5912,6 +5912,11 @@ def interpolate( warnings.warn( "Unsorted Index..." ) + if all(dt == np.dtype('object') for dt in self.dtypes): + raise TypeError( + "Cannot interpolate with all object-dtype columns in the DataFrame. " + "Try setting at least one column to a numeric dtype." + ) return super()._interpolate( method=method, axis=axis, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a8b84aead29..970ca189a48 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1424,7 +1424,17 @@ def _apply_boolean_mask(self, boolean_mask): result._copy_type_metadata(self) return result - def _interpolate(self, method='linear'): + def _interpolate( + self, + method='linear', + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs + ): """ Interpolate data values between some points. diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index f5e2523f323..3a04ef887bf 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -1,28 +1,74 @@ import pandas as pd import cudf import pytest -from cudf.testing._utils import assert_eq +from cudf.testing._utils import assert_eq, assert_exceptions_equal @pytest.mark.parametrize("data", [ + # basics { - 'A': [1, None, 3] + 'A': [1, 2, 3], + 'B': [4, 5, 6] }, { - 'A': [None, 2, 3, None, 5] + 'A': [1, None, 3], + 'B': [4, None, 6] }, { - 'A': [1, None, 3], - 'B': [None, 2, 3] + 'A': [None, 2, 3], + 'B': [4, 5, None] } ]) @pytest.mark.parametrize("method", ['linear']) @pytest.mark.parametrize("axis", [0]) -def test_interpolate_nans(data, method,axis): +def test_interpolate_dataframe(data, method, axis): # doesn't seem to work with NAs just yet gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() expect = pdf.interpolate(method=method, axis=axis) got = gdf.interpolate(method=method, axis=axis) - breakpoint() assert_eq(expect, got) + +@pytest.mark.parametrize("data", [ + [1,2,3], + [1, None, 3], + [None, 2, None, 4], + [1, None, 3, None], + [0.1, 0.2, 0.3] +]) +@pytest.mark.parametrize("method", ['linear']) +@pytest.mark.parametrize("axis", [0]) +def test_interpolate_series(data, method, axis): + gsr = cudf.Series(data) + psr = gsr.to_pandas() + + expect = psr.interpolate(method=method, axis=axis) + got = gsr.interpolate(method=method, axis=axis) + + assert_eq(expect, got) + +@pytest.mark.parametrize('data,kwargs', [ + ( + { + 'A': ['a','b','c'], + 'B': ['d','e','f'] + }, + {'axis': 0, 'method': 'linear'}, + ) +]) +def test_interpolate_dataframe_error_cases(data, kwargs): + gsr = cudf.DataFrame(data) + psr = gsr.to_pandas() + + assert_exceptions_equal( + lfunc = psr.interpolate, + rfunc = gsr.interpolate, + lfunc_args_and_kwargs = ( + [], + kwargs + ), + rfunc_args_and_kwargs = ( + [], + kwargs + ) + ) From c16f2b3896c998e2e20a58d100f1f0ac0b908d61 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 21 Jul 2021 15:40:57 -0700 Subject: [PATCH 07/23] progress --- python/cudf/cudf/core/algorithms.py | 35 ++++++++++++++++-- python/cudf/cudf/core/dataframe.py | 4 --- python/cudf/cudf/core/frame.py | 34 ++++++++---------- python/cudf/cudf/tests/test_interpolate.py | 42 +++++++++++++++++----- 4 files changed, 81 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 9f26ac8ee78..0afb505a67a 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,10 +1,10 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from warnings import warn - +import numpy as np import cupy as cp from cudf.core.series import Index, Series - +from cudf.core.column import as_column def factorize(values, sort=False, na_sentinel=-1, size_hint=None): """Encode the input values as integer labels @@ -59,3 +59,34 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): values.name = name return labels, cats.values if return_cupy_array else Index(cats) + +def linear_interpolation(col, xax): + # fill all NAs with NaNs + col = col.astype('float64').fillna(np.nan) + + # figure out where the nans are + not_nan_mask = ~cp.isnan(col) + + # find the first nan + first_nan_idx = as_column(not_nan_mask).find_first_value(1) + + known_x = cp.asarray(xax.apply_boolean_mask(not_nan_mask)) + known_y = cp.asarray(col.apply_boolean_mask(not_nan_mask)).astype(np.dtype('float64')) + + result = cp.interp( + cp.asarray(xax), + known_x, + known_y + ) + + result[:first_nan_idx] = np.nan + + return result + +def get_column_interpolator(method): + if method == 'linear': + return linear_interpolation + else: + raise ValueError( + f"Interpolation method `{method}` not found" + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7717fa167c5..7876d402899 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5904,10 +5904,6 @@ def interpolate( **kwargs ): - if method not in {'linear', 'index', 'values'}: - raise ValueError( - f"method {method} is not supported." - ) if method in {'index', 'values'} and not self.index.is_monotonic_increasing: warnings.warn( "Unsorted Index..." diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 970ca189a48..4381690fc7b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1460,35 +1460,31 @@ def _interpolate( some or all ``NaN`` values """ + + if method in {'pad', 'ffill'} and limit_direction != 'forward': + raise ValueError( + f"`limit_direction` must be 'forward' for method `{method}`" + ) + if method in {'backfill', 'bfill'} and limit_direction != 'backward': + raise ValueError( + f"`limit_direction` must be 'backward' for method `{method}`" + ) + + columns = ColumnAccessor() - if method == 'linear': + if method in 'linear': xax = as_column(cupy.arange(len(self))) elif method in {'index', 'values'}: xax = self.index + interpolator = cudf.core.algorithms.get_column_interpolator(method) for colname, col in self._data.items(): if col.nullable: - not_null = col.notnull() - known_x = cupy.asarray( - xax.apply_boolean_mask(not_null) - ) - known_y = cupy.asarray( - col.apply_boolean_mask(not_null) - ).astype(np.dtype('float64')) - - result = cupy.interp( - cupy.asarray(xax), - known_x, - known_y, - left=np.nan, - right=np.nan) - else: - # The trivial case - result = col + col = col.fillna(np.nan) + result = interpolator(col, xax) columns[colname] = result - return self.__class__(columns) def _quantiles( diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 3a04ef887bf..1c9ac46af66 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -10,12 +10,12 @@ 'B': [4, 5, 6] }, { - 'A': [1, None, 3], - 'B': [4, None, 6] + 'A': [1.0, None, 3.0], + 'B': [4.0, None, 6.0] }, { - 'A': [None, 2, 3], - 'B': [4, 5, None] + 'A': [None, 2.0, 3.0], + 'B': [4.0, 5.0, None] } ]) @pytest.mark.parametrize("method", ['linear']) @@ -30,10 +30,10 @@ def test_interpolate_dataframe(data, method, axis): assert_eq(expect, got) @pytest.mark.parametrize("data", [ - [1,2,3], - [1, None, 3], - [None, 2, None, 4], - [1, None, 3, None], + [1.0,2.0,3.0], + [1.0, None, 3.0], + [None, 2.0, None, 4.0], + [1.0, None, 3.0, None], [0.1, 0.2, 0.3] ]) @pytest.mark.parametrize("method", ['linear']) @@ -54,7 +54,31 @@ def test_interpolate_series(data, method, axis): 'B': ['d','e','f'] }, {'axis': 0, 'method': 'linear'}, - ) + ), + ( + { + 'A': [1,2,3] + }, + {'method': 'pad', 'limit_direction': 'backward'} + ), + ( + { + 'A': [1,2,3] + }, + {'method': 'ffill', 'limit_direction': 'backward'} + ), + ( + { + 'A': [1,2,3] + }, + {'method': 'bfill', 'limit_direction': 'forward'} + ), + ( + { + 'A': [1,2,3] + }, + {'method': 'backfill', 'limit_direction': 'forward'} + ), ]) def test_interpolate_dataframe_error_cases(data, kwargs): gsr = cudf.DataFrame(data) From fe56bb190d7a55c979cc702a7f300437921163fa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Jul 2021 12:19:08 -0700 Subject: [PATCH 08/23] refactoring --- python/cudf/cudf/core/algorithms.py | 43 ++++++++++++++++++---- python/cudf/cudf/core/frame.py | 13 ++++--- python/cudf/cudf/tests/test_interpolate.py | 7 +++- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 0afb505a67a..6a2b22aa1ef 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,7 +5,7 @@ from cudf.core.series import Index, Series from cudf.core.column import as_column - +from cudf.core.index import RangeIndex def factorize(values, sort=False, na_sentinel=-1, size_hint=None): """Encode the input values as integer labels @@ -60,32 +60,59 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): return labels, cats.values if return_cupy_array else Index(cats) -def linear_interpolation(col, xax): +def linear_interpolation(to_interp): + """ + Interpolate over a float column. Implicitly assumes that values are + evenly spaced with respect to the x-axis, for example the data + [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way + between the two valid values, yielding [1.0, 2.0, 3.0] + """ + + to_interp._index = RangeIndex(start=0, stop=len(to_interp), step=1) + return index_or_values_interpolation(to_interp) + +def index_or_values_interpolation(to_interp): + """ + Interpolate over a float column. assumes a linear interpolation + strategy using the index of the data to denote spacing of the x + values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] + would result in [1.0, 3.0, 4.0] + """ + xax = to_interp._index._column + + col = to_interp._data[list(to_interp._data.keys())[0]] + # fill all NAs with NaNs col = col.astype('float64').fillna(np.nan) # figure out where the nans are - not_nan_mask = ~cp.isnan(col) + mask = cp.isnan(col) + + # trivial case + if mask.all(): + return col + + mask = ~mask # find the first nan - first_nan_idx = as_column(not_nan_mask).find_first_value(1) + first_nan_idx = as_column(mask).find_first_value(1) - known_x = cp.asarray(xax.apply_boolean_mask(not_nan_mask)) - known_y = cp.asarray(col.apply_boolean_mask(not_nan_mask)).astype(np.dtype('float64')) + known_x = cp.asarray(xax.apply_boolean_mask(mask)) + known_y = cp.asarray(col.apply_boolean_mask(mask)).astype(np.dtype('float64')) result = cp.interp( cp.asarray(xax), known_x, known_y ) - result[:first_nan_idx] = np.nan - return result def get_column_interpolator(method): if method == 'linear': return linear_interpolation + elif method in {'index', 'values'}: + return index_or_values_interpolation else: raise ValueError( f"Interpolation method `{method}` not found" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4381690fc7b..5686927bbf4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1473,16 +1473,17 @@ def _interpolate( columns = ColumnAccessor() - if method in 'linear': - xax = as_column(cupy.arange(len(self))) - elif method in {'index', 'values'}: - xax = self.index interpolator = cudf.core.algorithms.get_column_interpolator(method) - for colname, col in self._data.items(): if col.nullable: col = col.fillna(np.nan) - result = interpolator(col, xax) + + # Interpolation methods may or may not need the index + to_interp = Frame( + data={colname: col}, + index=self.index + ) + result = interpolator(to_interp) columns[colname] = result return self.__class__(columns) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 1c9ac46af66..974ade03ef7 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -6,8 +6,8 @@ @pytest.mark.parametrize("data", [ # basics { - 'A': [1, 2, 3], - 'B': [4, 5, 6] + 'A': [1.0, 2.0, 3.0], + 'B': [4.0, 5.0, 6.0] }, { 'A': [1.0, None, 3.0], @@ -34,6 +34,9 @@ def test_interpolate_dataframe(data, method, axis): [1.0, None, 3.0], [None, 2.0, None, 4.0], [1.0, None, 3.0, None], + [None, None, 3.0, 4.0], + [1.0, 2.0, None, None], + [None, None, None, None], [0.1, 0.2, 0.3] ]) @pytest.mark.parametrize("method", ['linear']) From a68161639c92973f343a65fa495c008bedc8ee49 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Jul 2021 12:31:58 -0700 Subject: [PATCH 09/23] test index and values methods --- python/cudf/cudf/tests/test_interpolate.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 974ade03ef7..3002e7b9279 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -50,6 +50,31 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got) +@pytest.mark.parametrize('data', [ + [1.0, 2.0, 3.0, 4.0], + [None, 2.0, 3.0, 4.0], + [1.0, 2.0, 3.0, None], + [None, None, 3.0, 4.0], + [1.0, 2.0, None, None], + [1.0, None, 3.0, None], + [None, 2.0, None, 4.0], + [None, None, None, None] +]) +@pytest.mark.parametrize('index', [ + [0, 1, 2, 3], + [0, 2, 4, 6], + [0, 3, 4, 9] +]) +@pytest.mark.parametrize('method', ['index', 'values']) +def test_interpolate_series_values_or_index(data, index, method): + gsr = cudf.Series(data, index=index) + psr = gsr.to_pandas() + + expect = psr.interpolate(method=method) + got = gsr.interpolate(method=method) + + assert_eq(expect, got) + @pytest.mark.parametrize('data,kwargs', [ ( { From 98608a9026044c5cc3e39ee1753dfa75b02036aa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Jul 2021 12:38:39 -0700 Subject: [PATCH 10/23] forgot the index --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5686927bbf4..bbb2f3192ec 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1486,7 +1486,7 @@ def _interpolate( result = interpolator(to_interp) columns[colname] = result - return self.__class__(columns) + return self.__class__(columns, index=self.index.copy()) def _quantiles( self, From 143c79808ac1d3f7da748f2909976cfcedb0039d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 23 Jul 2021 08:39:43 -0700 Subject: [PATCH 11/23] style --- python/cudf/cudf/core/algorithms.py | 51 +++---- python/cudf/cudf/core/dataframe.py | 36 ++--- python/cudf/cudf/core/frame.py | 44 +++--- python/cudf/cudf/core/series.py | 20 +-- python/cudf/cudf/tests/test_interpolate.py | 157 +++++++++------------ 5 files changed, 144 insertions(+), 164 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 6a2b22aa1ef..7e865850c38 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,11 +1,14 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from warnings import warn -import numpy as np + import cupy as cp +import numpy as np -from cudf.core.series import Index, Series from cudf.core.column import as_column from cudf.core.index import RangeIndex +from cudf.core.series import Index, Series + + def factorize(values, sort=False, na_sentinel=-1, size_hint=None): """Encode the input values as integer labels @@ -60,17 +63,19 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): return labels, cats.values if return_cupy_array else Index(cats) + def linear_interpolation(to_interp): """ - Interpolate over a float column. Implicitly assumes that values are + Interpolate over a float column. Implicitly assumes that values are evenly spaced with respect to the x-axis, for example the data - [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way + [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way between the two valid values, yielding [1.0, 2.0, 3.0] """ to_interp._index = RangeIndex(start=0, stop=len(to_interp), step=1) return index_or_values_interpolation(to_interp) + def index_or_values_interpolation(to_interp): """ Interpolate over a float column. assumes a linear interpolation @@ -78,12 +83,12 @@ def index_or_values_interpolation(to_interp): values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] would result in [1.0, 3.0, 4.0] """ - xax = to_interp._index._column - - col = to_interp._data[list(to_interp._data.keys())[0]] + colname = list(to_interp._data.keys())[0] + to_interp._data[colname] = ( + to_interp._data[colname].astype("float64").fillna(np.nan) + ) - # fill all NAs with NaNs - col = col.astype('float64').fillna(np.nan) + col = to_interp._data[colname] # figure out where the nans are mask = cp.isnan(col) @@ -91,29 +96,25 @@ def index_or_values_interpolation(to_interp): # trivial case if mask.all(): return col - - mask = ~mask - # find the first nan - first_nan_idx = as_column(mask).find_first_value(1) + mask = as_column(~mask) + known_x_and_y = to_interp._apply_boolean_mask(mask) - known_x = cp.asarray(xax.apply_boolean_mask(mask)) - known_y = cp.asarray(col.apply_boolean_mask(mask)).astype(np.dtype('float64')) + known_x = cp.asarray(known_x_and_y._index._column) + known_y = cp.asarray(known_x_and_y._data.columns[0]) - result = cp.interp( - cp.asarray(xax), - known_x, - known_y - ) + result = cp.interp(cp.asarray(to_interp._index), known_x, known_y) + + # find the first nan + first_nan_idx = as_column(mask).find_first_value(1) result[:first_nan_idx] = np.nan return result + def get_column_interpolator(method): - if method == 'linear': + if method == "linear": return linear_interpolation - elif method in {'index', 'values'}: + elif method in {"index", "values"}: return index_or_values_interpolation else: - raise ValueError( - f"Interpolation method `{method}` not found" - ) + raise ValueError(f"Interpolation method `{method}` not found") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7876d402899..79ddd035f9e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5894,34 +5894,36 @@ def _from_columns(cls, cols, index=None, columns=None): @copy_docstring(Frame._interpolate) def interpolate( self, - method='linear', + method="linear", axis=0, limit=None, inplace=False, limit_direction=None, limit_area=None, downcast=None, - **kwargs + **kwargs, ): - if method in {'index', 'values'} and not self.index.is_monotonic_increasing: - warnings.warn( - "Unsorted Index..." - ) - if all(dt == np.dtype('object') for dt in self.dtypes): + if ( + method in {"index", "values"} + and not self.index.is_monotonic_increasing + ): + warnings.warn("Unsorted Index...") + if all(dt == np.dtype("object") for dt in self.dtypes): raise TypeError( - "Cannot interpolate with all object-dtype columns in the DataFrame. " - "Try setting at least one column to a numeric dtype." + "Cannot interpolate with all object-dtype " + "columns in the DataFrame. Try setting at " + "least one column to a numeric dtype." ) return super()._interpolate( - method=method, - axis=axis, - limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, ) def quantile( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bbb2f3192ec..82224e92550 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -65,7 +65,12 @@ def __init_subclass__(cls): @classmethod def _from_table(cls, table: Frame): - return cls(table._data, index=table._index) + return cls( + table._data, + index=cudf.Index._from_table(table._index) + if table._index is not None + else table._index, + ) def _mimic_inplace( self: T, result: Frame, inplace: bool = False @@ -1415,7 +1420,6 @@ def _apply_boolean_mask(self, boolean_mask): rows corresponding to `False` is dropped """ boolean_mask = as_column(boolean_mask) - result = self.__class__._from_table( libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) @@ -1425,15 +1429,15 @@ def _apply_boolean_mask(self, boolean_mask): return result def _interpolate( - self, - method='linear', - axis=0, - limit=None, - inplace=False, - limit_direction=None, - limit_area=None, - downcast=None, - **kwargs + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs, ): """ Interpolate data values between some points. @@ -1445,8 +1449,8 @@ def _interpolate( only 'linear` is supported. * 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. - * 'index', 'values': linearly interpolate using the index as - an x-axis. Note that unsorted indices can lead to erroneous results. + * 'index', 'values': linearly interpolate using the index as + an x-axis. Unsorted indices can lead to erroneous results. axis : int, default 0 Axis to interpolate along. Currently, only 'axis=0' is supprted. @@ -1461,29 +1465,25 @@ def _interpolate( """ - if method in {'pad', 'ffill'} and limit_direction != 'forward': + if method in {"pad", "ffill"} and limit_direction != "forward": raise ValueError( f"`limit_direction` must be 'forward' for method `{method}`" ) - if method in {'backfill', 'bfill'} and limit_direction != 'backward': + if method in {"backfill", "bfill"} and limit_direction != "backward": raise ValueError( f"`limit_direction` must be 'backward' for method `{method}`" ) - columns = ColumnAccessor() interpolator = cudf.core.algorithms.get_column_interpolator(method) for colname, col in self._data.items(): if col.nullable: col = col.fillna(np.nan) - + # Interpolation methods may or may not need the index - to_interp = Frame( - data={colname: col}, - index=self.index - ) - result = interpolator(to_interp) + to_interp = Frame(data={colname: col}, index=self.index) + result = interpolator(to_interp) columns[colname] = result return self.__class__(columns, index=self.index.copy()) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0f97d9607ff..3a720a8cfc1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5421,24 +5421,24 @@ def hash_encode(self, stop, use_name=False): @copy_docstring(Frame._interpolate) def interpolate( self, - method='linear', + method="linear", axis=0, limit=None, inplace=False, limit_direction=None, limit_area=None, downcast=None, - **kwargs + **kwargs, ): return super()._interpolate( - method=method, - axis=axis, - limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, ) def quantile( diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 3002e7b9279..70a9425ad0f 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -1,45 +1,44 @@ -import pandas as pd -import cudf import pytest + +import cudf from cudf.testing._utils import assert_eq, assert_exceptions_equal -@pytest.mark.parametrize("data", [ - # basics - { - 'A': [1.0, 2.0, 3.0], - 'B': [4.0, 5.0, 6.0] - }, - { - 'A': [1.0, None, 3.0], - 'B': [4.0, None, 6.0] - }, - { - 'A': [None, 2.0, 3.0], - 'B': [4.0, 5.0, None] - } -]) -@pytest.mark.parametrize("method", ['linear']) + +@pytest.mark.parametrize( + "data", + [ + # basics + {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]}, + {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]}, + {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]}, + ], +) +@pytest.mark.parametrize("method", ["linear"]) @pytest.mark.parametrize("axis", [0]) -def test_interpolate_dataframe(data, method, axis): +def test_interpolate_dataframe(data, method, axis): # doesn't seem to work with NAs just yet gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - + expect = pdf.interpolate(method=method, axis=axis) got = gdf.interpolate(method=method, axis=axis) assert_eq(expect, got) -@pytest.mark.parametrize("data", [ - [1.0,2.0,3.0], - [1.0, None, 3.0], - [None, 2.0, None, 4.0], - [1.0, None, 3.0, None], - [None, None, 3.0, 4.0], - [1.0, 2.0, None, None], - [None, None, None, None], - [0.1, 0.2, 0.3] -]) -@pytest.mark.parametrize("method", ['linear']) + +@pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, 3.0], + [1.0, None, 3.0], + [None, 2.0, None, 4.0], + [1.0, None, 3.0, None], + [None, None, 3.0, 4.0], + [1.0, 2.0, None, None], + [None, None, None, None], + [0.1, 0.2, 0.3], + ], +) +@pytest.mark.parametrize("method", ["linear"]) @pytest.mark.parametrize("axis", [0]) def test_interpolate_series(data, method, axis): gsr = cudf.Series(data) @@ -50,22 +49,22 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got) -@pytest.mark.parametrize('data', [ - [1.0, 2.0, 3.0, 4.0], - [None, 2.0, 3.0, 4.0], - [1.0, 2.0, 3.0, None], - [None, None, 3.0, 4.0], - [1.0, 2.0, None, None], - [1.0, None, 3.0, None], - [None, 2.0, None, 4.0], - [None, None, None, None] -]) -@pytest.mark.parametrize('index', [ - [0, 1, 2, 3], - [0, 2, 4, 6], - [0, 3, 4, 9] -]) -@pytest.mark.parametrize('method', ['index', 'values']) + +@pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, 3.0, 4.0], + [None, 2.0, 3.0, 4.0], + [1.0, 2.0, 3.0, None], + [None, None, 3.0, 4.0], + [1.0, 2.0, None, None], + [1.0, None, 3.0, None], + [None, 2.0, None, 4.0], + [None, None, None, None], + ], +) +@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]]) +@pytest.mark.parametrize("method", ["index", "values"]) def test_interpolate_series_values_or_index(data, index, method): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() @@ -75,52 +74,30 @@ def test_interpolate_series_values_or_index(data, index, method): assert_eq(expect, got) -@pytest.mark.parametrize('data,kwargs', [ - ( - { - 'A': ['a','b','c'], - 'B': ['d','e','f'] - }, - {'axis': 0, 'method': 'linear'}, - ), - ( - { - 'A': [1,2,3] - }, - {'method': 'pad', 'limit_direction': 'backward'} - ), - ( - { - 'A': [1,2,3] - }, - {'method': 'ffill', 'limit_direction': 'backward'} - ), - ( - { - 'A': [1,2,3] - }, - {'method': 'bfill', 'limit_direction': 'forward'} - ), - ( - { - 'A': [1,2,3] - }, - {'method': 'backfill', 'limit_direction': 'forward'} - ), -]) + +@pytest.mark.parametrize( + "data,kwargs", + [ + ( + {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, + {"axis": 0, "method": "linear"}, + ), + ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "backward"}), + ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "backward"}), + ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "forward"}), + ( + {"A": [1, 2, 3]}, + {"method": "backfill", "limit_direction": "forward"}, + ), + ], +) def test_interpolate_dataframe_error_cases(data, kwargs): gsr = cudf.DataFrame(data) psr = gsr.to_pandas() assert_exceptions_equal( - lfunc = psr.interpolate, - rfunc = gsr.interpolate, - lfunc_args_and_kwargs = ( - [], - kwargs - ), - rfunc_args_and_kwargs = ( - [], - kwargs - ) + lfunc=psr.interpolate, + rfunc=gsr.interpolate, + lfunc_args_and_kwargs=([], kwargs), + rfunc_args_and_kwargs=([], kwargs), ) From 81ffee10f735779db52cfa57b8b0d11cc564dc16 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 23 Jul 2021 08:41:59 -0700 Subject: [PATCH 12/23] remove unnecessary older changes --- python/cudf/cudf/core/dataframe.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 79ddd035f9e..4ee21d5777e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7171,12 +7171,12 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): **kwargs, ) - def _apply_support_method(self, _method, axis=0, *args, **kwargs): + def _apply_support_method(self, method, axis=0, *args, **kwargs): assert axis in (None, 0, 1) if axis in (None, 0): result = [ - getattr(self[col], _method)(*args, **kwargs) + getattr(self[col], method)(*args, **kwargs) for col in self._data.names ] @@ -7193,13 +7193,13 @@ def _apply_support_method(self, _method, axis=0, *args, **kwargs): elif axis == 1: # for dask metadata compatibility skipna = kwargs.pop("skipna", None) - if _method not in _cupy_nan_methods_map and skipna not in ( + if method not in _cupy_nan_methods_map and skipna not in ( None, True, 1, ): raise NotImplementedError( - f"Row-wise operation to calculate '{_method}'" + f"Row-wise operation to calculate '{method}'" f" currently do not support `skipna=False`." ) @@ -7231,7 +7231,7 @@ def _apply_support_method(self, _method, axis=0, *args, **kwargs): ) prepared, mask, common_dtype = self._prepare_for_rowwise_op( - _method, skipna + method, skipna ) for col in prepared._data.names: if prepared._data[col].nullable: @@ -7248,10 +7248,10 @@ def _apply_support_method(self, _method, axis=0, *args, **kwargs): ) arr = cupy.asarray(prepared.as_gpu_matrix()) - if skipna is not False and _method in _cupy_nan_methods_map: - _method = _cupy_nan_methods_map[_method] + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] - result = getattr(cupy, _method)(arr, axis=1, **kwargs) + result = getattr(cupy, method)(arr, axis=1, **kwargs) if result.ndim == 1: type_coerced_methods = { @@ -7267,7 +7267,7 @@ def _apply_support_method(self, _method, axis=0, *args, **kwargs): } result_dtype = ( common_dtype - if _method in type_coerced_methods + if method in type_coerced_methods or is_datetime_dtype(common_dtype) else None ) From f859d0ee692965d591471918fc8be7087e10807d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 06:07:24 -0700 Subject: [PATCH 13/23] directly add and test unsorted index case --- python/cudf/cudf/core/algorithms.py | 6 +++--- python/cudf/cudf/core/dataframe.py | 6 +----- python/cudf/cudf/core/frame.py | 14 +++++++++++--- python/cudf/cudf/tests/test_interpolate.py | 14 ++++++++++++++ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 7e865850c38..bbb01958b73 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -89,12 +89,12 @@ def index_or_values_interpolation(to_interp): ) col = to_interp._data[colname] - # figure out where the nans are mask = cp.isnan(col) - # trivial case - if mask.all(): + # trivial cases, all nan or no nans + num_nan = mask.sum() + if num_nan == 0 or num_nan == len(to_interp): return col mask = as_column(~mask) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4ee21d5777e..1f928b77943 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5904,17 +5904,13 @@ def interpolate( **kwargs, ): - if ( - method in {"index", "values"} - and not self.index.is_monotonic_increasing - ): - warnings.warn("Unsorted Index...") if all(dt == np.dtype("object") for dt in self.dtypes): raise TypeError( "Cannot interpolate with all object-dtype " "columns in the DataFrame. Try setting at " "least one column to a numeric dtype." ) + return super()._interpolate( method=method, axis=axis, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 82224e92550..f28d0a1bebe 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1474,19 +1474,27 @@ def _interpolate( f"`limit_direction` must be 'backward' for method `{method}`" ) + columns = ColumnAccessor() + perm_sort = self._index.argsort() + sorted_data = self._gather(perm_sort) + interpolator = cudf.core.algorithms.get_column_interpolator(method) - for colname, col in self._data.items(): + for colname, col in sorted_data._data.items(): if col.nullable: col = col.fillna(np.nan) # Interpolation methods may or may not need the index - to_interp = Frame(data={colname: col}, index=self.index) + to_interp = Frame(data={colname: col}, index=sorted_data._index) result = interpolator(to_interp) columns[colname] = result - return self.__class__(columns, index=self.index.copy()) + result = self.__class__(columns, index=sorted_data._index) + # that which was once sorted, now is not + restored = result._gather(perm_sort.argsort()) + + return restored def _quantiles( self, diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 70a9425ad0f..ba5792d4e84 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -49,6 +49,20 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got) +@pytest.mark.parametrize('data,index', [ + ( + [2.0, None, 4.0, None, 2.0], + [1, 2, 3, 2, 1] + ) +]) +def test_interpolate_series_unsorted_index(data, index): + gsr = cudf.Series(data, index=index) + psr = gsr.to_pandas() + + expect = psr.interpolate(method='values') + got = gsr.interpolate(method='values') + + assert_eq(expect, got) @pytest.mark.parametrize( "data", From 71272a94806e375fd109885d81a6271363d7cd34 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 06:20:00 -0700 Subject: [PATCH 14/23] ....but dont do it for RangeIndex based data --- python/cudf/cudf/core/frame.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f28d0a1bebe..8ca26df70de 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1474,27 +1474,30 @@ def _interpolate( f"`limit_direction` must be 'backward' for method `{method}`" ) - + data = self columns = ColumnAccessor() - perm_sort = self._index.argsort() - sorted_data = self._gather(perm_sort) + if not isinstance(data._index, cudf.RangeIndex): + perm_sort = data._index.argsort() + data = data._gather(perm_sort) interpolator = cudf.core.algorithms.get_column_interpolator(method) - for colname, col in sorted_data._data.items(): + for colname, col in data._data.items(): if col.nullable: col = col.fillna(np.nan) # Interpolation methods may or may not need the index - to_interp = Frame(data={colname: col}, index=sorted_data._index) + to_interp = Frame(data={colname: col}, index=data._index) result = interpolator(to_interp) columns[colname] = result - result = self.__class__(columns, index=sorted_data._index) - # that which was once sorted, now is not - restored = result._gather(perm_sort.argsort()) + result = self.__class__(columns, index=data._index) + + if not isinstance(data._index, cudf.RangeIndex): + # that which was once sorted, now is not + result = result._gather(perm_sort.argsort()) - return restored + return result def _quantiles( self, From 52e431aed0ae3faf6c2665fa198afcdb5b661d0f Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 28 Jul 2021 08:28:11 -0500 Subject: [PATCH 15/23] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/algorithms.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 7e865850c38..0fc678a45cc 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -64,7 +64,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): return labels, cats.values if return_cupy_array else Index(cats) -def linear_interpolation(to_interp): +def _linear_interpolation(to_interp): """ Interpolate over a float column. Implicitly assumes that values are evenly spaced with respect to the x-axis, for example the data @@ -76,7 +76,7 @@ def linear_interpolation(to_interp): return index_or_values_interpolation(to_interp) -def index_or_values_interpolation(to_interp): +def _index_or_values_interpolation(to_interp): """ Interpolate over a float column. assumes a linear interpolation strategy using the index of the data to denote spacing of the x @@ -97,8 +97,7 @@ def index_or_values_interpolation(to_interp): if mask.all(): return col - mask = as_column(~mask) - known_x_and_y = to_interp._apply_boolean_mask(mask) + known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = cp.asarray(known_x_and_y._index._column) known_y = cp.asarray(known_x_and_y._data.columns[0]) @@ -106,7 +105,7 @@ def index_or_values_interpolation(to_interp): result = cp.interp(cp.asarray(to_interp._index), known_x, known_y) # find the first nan - first_nan_idx = as_column(mask).find_first_value(1) + first_nan_idx = (mask == 1).argmax().item() result[:first_nan_idx] = np.nan return result From 088618efa771c2991c8c6168539298f142e3781b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 06:30:19 -0700 Subject: [PATCH 16/23] fix minor bugs --- python/cudf/cudf/core/algorithms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 9f89abe54f7..cf84a2f3432 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -73,7 +73,7 @@ def _linear_interpolation(to_interp): """ to_interp._index = RangeIndex(start=0, stop=len(to_interp), step=1) - return index_or_values_interpolation(to_interp) + return _index_or_values_interpolation(to_interp) def _index_or_values_interpolation(to_interp): @@ -105,15 +105,15 @@ def _index_or_values_interpolation(to_interp): result = cp.interp(cp.asarray(to_interp._index), known_x, known_y) # find the first nan - first_nan_idx = (mask == 1).argmax().item() + first_nan_idx = (mask == 0).argmax().item() result[:first_nan_idx] = np.nan return result def get_column_interpolator(method): if method == "linear": - return linear_interpolation + return _linear_interpolation elif method in {"index", "values"}: - return index_or_values_interpolation + return _index_or_values_interpolation else: raise ValueError(f"Interpolation method `{method}` not found") From 4785a564534abad4c258dd94dae27426cf211f1c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 07:46:20 -0700 Subject: [PATCH 17/23] address reviews --- python/cudf/cudf/core/dataframe.py | 1 - python/cudf/cudf/core/frame.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1f928b77943..f5e43eb4af7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5891,7 +5891,6 @@ def _from_columns(cls, cols, index=None, columns=None): return cls(data=data, index=index,) - @copy_docstring(Frame._interpolate) def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8ca26df70de..5de7e6df53c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1475,7 +1475,7 @@ def _interpolate( ) data = self - columns = ColumnAccessor() + columns = {} if not isinstance(data._index, cudf.RangeIndex): perm_sort = data._index.argsort() @@ -1491,7 +1491,7 @@ def _interpolate( result = interpolator(to_interp) columns[colname] = result - result = self.__class__(columns, index=data._index) + result = self.__class__(ColumnAccessor(columns), index=data._index) if not isinstance(data._index, cudf.RangeIndex): # that which was once sorted, now is not From ed6cb8168900f3f412a49a95f2e06a1e23e83f65 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 07:55:29 -0700 Subject: [PATCH 18/23] more reviews --- python/cudf/cudf/core/algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index cf84a2f3432..f681721552b 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -99,10 +99,10 @@ def _index_or_values_interpolation(to_interp): known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) - known_x = cp.asarray(known_x_and_y._index._column) - known_y = cp.asarray(known_x_and_y._data.columns[0]) + known_x = known_x_and_y._index._column.values + known_y = known_x_and_y._data.columns[0].values - result = cp.interp(cp.asarray(to_interp._index), known_x, known_y) + result = cp.interp(to_interp._index.values, known_x, known_y) # find the first nan first_nan_idx = (mask == 0).argmax().item() From b85edc177f39ca2ef14e45aa4638c7fd10cf21e2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 08:02:29 -0700 Subject: [PATCH 19/23] just expose interpolate directly --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 23 ----------------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f5e43eb4af7..59d9576376d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5910,7 +5910,7 @@ def interpolate( "least one column to a numeric dtype." ) - return super()._interpolate( + return super().interpolate( method=method, axis=axis, limit=limit, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5de7e6df53c..e564452c43f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1428,7 +1428,7 @@ def _apply_boolean_mask(self, boolean_mask): result._copy_type_metadata(self) return result - def _interpolate( + def interpolate( self, method="linear", axis=0, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3a720a8cfc1..565b72bd087 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5418,29 +5418,6 @@ def hash_encode(self, stop, use_name=False): mod_vals = hashed_values % stop return Series(mod_vals._column, index=self.index, name=self.name) - @copy_docstring(Frame._interpolate) - def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction=None, - limit_area=None, - downcast=None, - **kwargs, - ): - return super()._interpolate( - method=method, - axis=axis, - limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs, - ) - def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): From 82c4f1ebe7a96abeca90e45f7ae360ab544d1c12 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Jul 2021 08:30:06 -0700 Subject: [PATCH 20/23] style --- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/tests/test_interpolate.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e564452c43f..6eb3442091d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1480,7 +1480,7 @@ def interpolate( if not isinstance(data._index, cudf.RangeIndex): perm_sort = data._index.argsort() data = data._gather(perm_sort) - + interpolator = cudf.core.algorithms.get_column_interpolator(method) for colname, col in data._data.items(): if col.nullable: diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index ba5792d4e84..e9b9e03891e 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -49,21 +49,20 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got) -@pytest.mark.parametrize('data,index', [ - ( - [2.0, None, 4.0, None, 2.0], - [1, 2, 3, 2, 1] - ) -]) + +@pytest.mark.parametrize( + "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])] +) def test_interpolate_series_unsorted_index(data, index): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() - expect = psr.interpolate(method='values') - got = gsr.interpolate(method='values') + expect = psr.interpolate(method="values") + got = gsr.interpolate(method="values") assert_eq(expect, got) + @pytest.mark.parametrize( "data", [ From b486c8b0a82afad3842386d8e3af33ff0c22ed24 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 2 Aug 2021 08:57:59 -0700 Subject: [PATCH 21/23] address last review comment --- python/cudf/cudf/core/algorithms.py | 34 ++++++++++++++--------------- python/cudf/cudf/core/frame.py | 5 ++--- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index f681721552b..f953c894db2 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,6 +5,7 @@ import numpy as np from cudf.core.column import as_column +from cudf.core.frame import Frame from cudf.core.index import RangeIndex from cudf.core.series import Index, Series @@ -64,7 +65,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): return labels, cats.values if return_cupy_array else Index(cats) -def _linear_interpolation(to_interp): +def _linear_interpolation(column, index=None): """ Interpolate over a float column. Implicitly assumes that values are evenly spaced with respect to the x-axis, for example the data @@ -72,31 +73,26 @@ def _linear_interpolation(to_interp): between the two valid values, yielding [1.0, 2.0, 3.0] """ - to_interp._index = RangeIndex(start=0, stop=len(to_interp), step=1) - return _index_or_values_interpolation(to_interp) + index = RangeIndex(start=0, stop=len(column), step=1) + return _index_or_values_interpolation(column, index=index) -def _index_or_values_interpolation(to_interp): +def _index_or_values_interpolation(column, index=None): """ Interpolate over a float column. assumes a linear interpolation strategy using the index of the data to denote spacing of the x values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] would result in [1.0, 3.0, 4.0] """ - colname = list(to_interp._data.keys())[0] - to_interp._data[colname] = ( - to_interp._data[colname].astype("float64").fillna(np.nan) - ) - - col = to_interp._data[colname] # figure out where the nans are - mask = cp.isnan(col) + mask = cp.isnan(column) # trivial cases, all nan or no nans num_nan = mask.sum() - if num_nan == 0 or num_nan == len(to_interp): - return col + if num_nan == 0 or num_nan == len(column): + return column + to_interp = Frame(data={None: column}, index=index) known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = known_x_and_y._index._column.values @@ -111,9 +107,11 @@ def _index_or_values_interpolation(to_interp): def get_column_interpolator(method): - if method == "linear": - return _linear_interpolation - elif method in {"index", "values"}: - return _index_or_values_interpolation - else: + interpolator = { + "linear": _linear_interpolation, + "index": _index_or_values_interpolation, + "values": _index_or_values_interpolation, + }.get(method, None) + if not interpolator: raise ValueError(f"Interpolation method `{method}` not found") + return interpolator diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 03c18ade830..6673caa05cf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1495,11 +1495,10 @@ def interpolate( interpolator = cudf.core.algorithms.get_column_interpolator(method) for colname, col in data._data.items(): if col.nullable: - col = col.fillna(np.nan) + col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - to_interp = Frame(data={colname: col}, index=data._index) - result = interpolator(to_interp) + result = interpolator(col, index=data._index) columns[colname] = result result = self.__class__(ColumnAccessor(columns), index=data._index) From 296eddc84adeb1236fa10d5c628f44453986c384 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 6 Aug 2021 05:39:30 -0700 Subject: [PATCH 22/23] address review --- python/cudf/cudf/core/frame.py | 17 ++++++++--------- python/cudf/cudf/tests/test_interpolate.py | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8d4b66bcbc7..cbd92920b33 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1490,28 +1490,27 @@ def interpolate( ) data = self - columns = {} if not isinstance(data._index, cudf.RangeIndex): perm_sort = data._index.argsort() data = data._gather(perm_sort) interpolator = cudf.core.algorithms.get_column_interpolator(method) + columns = {} for colname, col in data._data.items(): if col.nullable: col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - result = interpolator(col, index=data._index) - columns[colname] = result - - result = self.__class__(ColumnAccessor(columns), index=data._index) + columns[colname] = interpolator(col, index=data._index) - if not isinstance(data._index, cudf.RangeIndex): - # that which was once sorted, now is not - result = result._gather(perm_sort.argsort()) + result = self._from_data(columns, index=data._index) - return result + return ( + result + if isinstance(data._index, cudf.RangeIndex) + else result._gather(perm_sort.argsort()) + ) def _quantiles( self, diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index e9b9e03891e..66556c48828 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -16,7 +16,10 @@ @pytest.mark.parametrize("method", ["linear"]) @pytest.mark.parametrize("axis", [0]) def test_interpolate_dataframe(data, method, axis): - # doesn't seem to work with NAs just yet + # Pandas interpolate methods do not seem to work + # with nullable dtypes yet, so this method treats + # NAs as NaNs + # https://github.com/pandas-dev/pandas/issues/40252 gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() From 94cc6da0a85eed096171721be97134088c186cf0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 6 Aug 2021 07:40:08 -0500 Subject: [PATCH 23/23] Update python/cudf/cudf/core/frame.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cbd92920b33..27c89246b07 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1468,7 +1468,7 @@ def interpolate( an x-axis. Unsorted indices can lead to erroneous results. axis : int, default 0 Axis to interpolate along. Currently, - only 'axis=0' is supprted. + only 'axis=0' is supported. inplace : bool, default False Update the data in place if possible.