Kernel ridge regression #4492

Merged
merged 19 commits on Feb 9, 2022
Numerical stability
RAMitchell committed Jan 23, 2022
commit a7ec29c1bbae3e6f150a433ce128fa7fb2c48d43
24 changes: 16 additions & 8 deletions python/cuml/kernel_ridge/kernel_ridge.py
@@ -146,7 +146,7 @@ def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
K.flat[:: n_samples + 1] += alpha[0]

try:
# we need to set tthe error mode of cupy to raise
# we need to set the error mode of cupy to raise
# otherwise we silently get an array of NaNs
err_mode = geterr()["linalg"]
seterr(linalg="raise")
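By default cupy can let a failed factorization propagate silently as NaNs instead of raising, which is why the error mode is flipped to "raise" around the solve. A minimal sketch of the save/raise/restore pattern, assuming the geterr/seterr helpers are cupyx.geterr/cupyx.seterr and with solve_spd as a purely illustrative name:

import cupy as cp
from cupyx import geterr, seterr

def solve_spd(K, y):
    # Remember the caller's linalg error mode, make solver failures raise,
    # and always restore the previous mode afterwards.
    prev = geterr()["linalg"]
    seterr(linalg="raise")
    try:
        return cp.linalg.solve(K, y)
    finally:
        seterr(linalg=prev)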
@@ -297,7 +297,11 @@ def evaluate_pairwise_kernels(X, Y, K):

threadsperblock = 256
blockspergrid = (X.shape[0] * Y.shape[0] + (threadsperblock - 1)) // threadsperblock
K = cp.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype)

# Here we force K to use 64 bit, even if the input is 32 bit
# 32 bit K results in serious numerical stability problems
K = cp.zeros((X.shape[0], Y.shape[0]), dtype=np.float64)

key = (metric, filtered_kwds_tuple, X.dtype, Y.dtype)
if key in _kernel_cache:
compiled_kernel = _kernel_cache[key]
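The float64 allocation above addresses accumulation error: each kernel entry sums over the feature dimension, and at float32 precision the resulting round-off is large enough to destabilize the Cholesky solve downstream. A small, self-contained illustration of the precision gap (not part of the PR):

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(10_000).astype(np.float32)

s32 = np.dot(x, x)                                        # accumulated at float32 precision
s64 = np.dot(x.astype(np.float64), x.astype(np.float64))  # float64 reference
print(abs(s32 - s64) / s64)  # relative error on the order of float32 eps (~1e-7) or larger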
@@ -334,10 +338,10 @@ def __init__(
self.kernel_params = kernel_params

def _get_kernel(self, X, Y=None):
if callable(self.kernel):
params = self.kernel_params or {}
else:
if isinstance(self.kernel, str):
params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0}
else:
params = self.kernel_params or {}
return pairwise_kernels(X, metric=self.kernel, filter_params=True, **params)

@generate_docstring()
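The branch above decides which parameters reach the kernel: a string kernel gets gamma/degree/coef0 (with filter_params=True dropping any a given kernel does not accept), while a callable kernel only receives kernel_params. A hedged usage sketch, with the import path and custom-kernel details assumed rather than taken from this PR:

from cuml.kernel_ridge import KernelRidge  # assumed import path

# String kernel: gamma/degree/coef0 apply; parameters the kernel does not
# accept are filtered out by pairwise_kernels(filter_params=True).
poly_model = KernelRidge(kernel="polynomial", degree=3, coef0=1.0, gamma=0.5)

# Callable kernel: only kernel_params is forwarded to the function.
def scaled_dot(x, y, scale=1.0):
    s = 0.0
    for i in range(len(x)):
        s += x[i] * y[i]
    return scale * s

custom_model = KernelRidge(kernel=scaled_dot, kernel_params={"scale": 0.5})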
@@ -367,15 +371,15 @@ def fit(self, X, y, sample_weight=None, convert_dtype=True) -> "KernelRidge":
msg = "X matrix must have at least a column"
raise TypeError(msg)

K = self._get_kernel(X, self.kernel)
K = self._get_kernel(X_m, self.kernel)
copy = self.kernel == "precomputed"
self.dual_coef_ = _solve_cholesky_kernel(
K, cp.asarray(y_m), self.alpha, sample_weight, copy
)

if ravel:
self.dual_coef_ = self.dual_coef_.ravel()
self.X_fit_ = X
self.X_fit_ = X_m
return self
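For reference, fit is solving the standard kernel ridge system: with K_ij = k(x_i, x_j) and regularization strength alpha,

$$
\hat{w} = (K + \alpha I_n)^{-1} y,
\qquad
\hat{f}(x) = \sum_{i=1}^{n} \hat{w}_i \, k(x, x_i),
$$

which is the linear system _solve_cholesky_kernel factorizes (sample weights, when given, are folded in before the solve).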

def predict(self, X):
@@ -392,6 +396,10 @@ def predict(self, X):
C : array of shape (n_samples,) or (n_samples, n_targets)
Returns predicted values.
"""
K = self._get_kernel(X, self.X_fit_)
X_m, _, _, _ = input_to_cuml_array(
X, check_dtype=[np.float32, np.float64]
)

K = self._get_kernel(X_m, self.X_fit_)
return cp.dot(K, self.dual_coef_)
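predict then evaluates the kernel between the new rows and the stored training rows and applies the dual coefficients. A hypothetical end-to-end usage sketch (import path and array handling assumed):

import cupy as cp
import numpy as np
from cuml.kernel_ridge import KernelRidge  # assumed import path

rs = np.random.RandomState(0)
X = cp.asarray(rs.rand(100, 5))
y = X @ cp.asarray(rs.rand(5))

model = KernelRidge(kernel="rbf", alpha=1.0, gamma=0.1).fit(X, y)
pred = model.predict(X)  # K(X, model.X_fit_) @ model.dual_coef_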

88 changes: 58 additions & 30 deletions python/cuml/test/test_kernel_ridge.py
@@ -35,15 +35,16 @@
from scipy.sparse.construct import rand
from sklearn.metrics import pairwise, mean_squared_error as mse
from sklearn.datasets import make_regression
from hypothesis import note, given, settings, strategies as st
from sklearn.kernel_ridge import KernelRidge as sklKernelRidge
from hypothesis import note, given, settings, assume, strategies as st
from hypothesis.extra.numpy import arrays


def gradient_norm(X, y, model):
X = cp.array(X)
y = cp.array(y)
K = cp.array(model._get_kernel(X.get()))
w = cp.array(model.dual_coef_).reshape(y.shape)
def gradient_norm(X, y, model, K):
X = cp.array(X, dtype=np.float64)
y = cp.array(y, dtype=np.float64)
K = cp.array(K, dtype=np.float64)
w = cp.array(model.dual_coef_, dtype=np.float64).reshape(y.shape)
grad = -cp.dot(K, y)
grad += cp.dot(cp.dot(K, K), w)
grad += cp.dot(K * model.alpha, w)
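For reference, gradient_norm checks the first-order optimality condition of the convex dual objective; the quantity assembled above corresponds to

$$
L(w) = \tfrac{1}{2}\,\lVert K w - y \rVert_2^2 + \tfrac{\alpha}{2}\, w^{\top} K w,
\qquad
\nabla L(w) = K K w - K y + \alpha K w,
$$

so a correctly converged solve should drive the norm of this gradient toward zero.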
@@ -144,23 +145,12 @@ def kernel_arg_strategy(draw):
def array_strategy(draw):
X_m = draw(st.integers(1, 20))
X_n = draw(st.integers(1, 10))
dtype = draw(st.sampled_from([np.float64
, np.float32]))
X = draw(
arrays(
dtype=dtype,
shape=(X_m, X_n),
elements=st.floats(0, 5, width=32),
)
)
dtype = draw(st.sampled_from([np.float64, np.float32]))
X = draw(arrays(dtype=dtype, shape=(X_m, X_n), elements=st.floats(0, 5, width=32),))
if draw(st.booleans()):
Y_m = draw(st.integers(1, 20))
Y = draw(
arrays(
dtype=dtype,
shape=(Y_m, X_n),
elements=st.floats(0, 5, width=32),
)
arrays(dtype=dtype, shape=(Y_m, X_n), elements=st.floats(0, 5, width=32),)
)
else:
Y = None
@@ -174,27 +164,65 @@ def test_pairwise_kernels(kernel_arg, XY):
kernel, args = kernel_arg
K = pairwise_kernels(X, Y, metric=kernel, **args)
skl_kernel = kernel.py_func if hasattr(kernel, "py_func") else kernel
K_sklearn = pairwise.pairwise_kernels(X, Y, metric=skl_kernel,**args)
K_sklearn = pairwise.pairwise_kernels(X, Y, metric=skl_kernel, **args)
assert np.allclose(K, K_sklearn, rtol=0.01)


@given(kernel_arg_strategy())
@st.composite
def Xy_strategy(draw):
X_m = draw(st.integers(5, 20))
X_n = draw(st.integers(2, 10))
dtype = draw(st.sampled_from([np.float64, np.float32]))
rs = np.random.RandomState(draw(st.integers(1, 10)))
X = rs.rand(X_m, X_n).astype(dtype)

a = draw(arrays(dtype=dtype, shape=X_n, elements=st.floats(0, 5, width=32),))
y = X.dot(a)
return (X, y)


@given(
kernel_arg_strategy(),
Xy_strategy(),
st.floats(0.0, 5.0),
st.floats(1.0, 5.0),
st.integers(1, 5),
st.floats(1.0, 5.0),
)
@settings(deadline=5000, max_examples=100)
def test_estimator(kernel_arg):
def test_estimator(kernel_arg, Xy, alpha, gamma, degree, coef0):
kernel, args = kernel_arg
model = cuKernelRidge(kernel=kernel, kernel_params=args)
X, y = make_regression(20, random_state=2)

model = cuKernelRidge(
kernel=kernel,
alpha=alpha,
gamma=gamma,
degree=degree,
coef0=coef0,
kernel_params=args,
)
skl_kernel = kernel.py_func if hasattr(kernel, "py_func") else kernel
skl_model = sklKernelRidge(
kernel=skl_kernel,
alpha=alpha,
gamma=gamma,
degree=degree,
coef0=coef0,
kernel_params=args,
)
X, y = Xy
if kernel == "chi2" or kernel == "additive_chi2":
# X must be positive
X = X + abs(X.min()) + 1.0

model.fit(X, y)
# For a convex optimisation problem we should arrive at gradient norm 0
# If the solution has converged correctly
assert gradient_norm(X, y, model) < 1e-5
K = model._get_kernel(X)
grad_norm = gradient_norm(X, y, model, K)

assert grad_norm < 1e-2
pred = model.predict(X).get()

# model should do better than predicting average
baseline_pred = np.full_like(pred, y.mean())
assert mse(y, pred) < mse(y, baseline_pred)
if X.dtype == np.float64:
skl_model.fit(X, y)
skl_pred = skl_model.predict(X)
assert np.allclose(pred, skl_pred, atol=1e-3, rtol=1e-3)
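A single-case, non-hypothesis version of this comparison (fixed rbf configuration; the cuml import path is assumed) might look like:

import numpy as np
import cupy as cp
from sklearn.kernel_ridge import KernelRidge as sklKernelRidge
from cuml.kernel_ridge import KernelRidge as cuKernelRidge  # assumed import path

rs = np.random.RandomState(0)
X = rs.rand(20, 5)
y = X.dot(rs.rand(5))

cu_model = cuKernelRidge(kernel="rbf", alpha=1.0, gamma=1.0).fit(X, y)
skl_model = sklKernelRidge(kernel="rbf", alpha=1.0, gamma=1.0).fit(X, y)

cu_pred = cp.asnumpy(cu_model.predict(X))  # asnumpy passes numpy output through unchanged
assert np.allclose(cu_pred, skl_model.predict(X), atol=1e-3, rtol=1e-3)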