Add gradient calculation for the covariance between points in GPyModelWrapper #347
emukit/model_wrappers/gpy_model_wrappers.py
@@ -61,6 +61,19 @@ def get_joint_prediction_gradients(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         dvariance_dx = dSigma(X, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
         return dmean_dx, dvariance_dx
 
+    def get_covariance_between_points_gradients(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
+        """
+        Computes and returns model gradients of the covariance between outputs at points X1 and X2 with respect
+        to X1.
+
+        :param X1: points to compute gradients at, nd array of shape (q1, d)
+        :param X2: points for the covariance of which to compute the gradient, nd array of shape (q2, d)
+        :return: gradient of the covariance matrix of shape (q1, q2) between outputs at X1 and X2
+                 (return shape is (q1, q2, q1, d))
+        """
+        dcov_dx1 = dCov(X1, X2, self.model.X, self.model.kern, self.model.posterior.woodbury_inv)
+        return dcov_dx1
+
     def set_data(self, X: np.ndarray, Y: np.ndarray) -> None:
         """
         Sets training data in model
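For reviewers trying the branch locally, a minimal usage sketch of the new method. The GPy regression model, RBF kernel, and array shapes below are illustrative assumptions, not part of the PR:

```python
import GPy
import numpy as np

from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper

# Illustrative toy model; any (n, d) training set works.
X_train, Y_train = np.random.randn(20, 2), np.random.randn(20, 1)
model = GPyModelWrapper(GPy.models.GPRegression(X_train, Y_train, GPy.kern.RBF(2)))

X1, X2 = np.random.randn(5, 2), np.random.randn(4, 2)
# dcov[:, :, i, j] is the gradient of the (5, 4) posterior covariance
# between outputs at X1 and X2 with respect to X1[i, j].
dcov = model.get_covariance_between_points_gradients(X1, X2)
assert dcov.shape == (5, 4, 5, 2)
```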
@@ -171,24 +184,23 @@ def dSigma(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
     :return: Gradient of the posterior covariance of shape (q, q, q, d)
     """
     q, d, n = x_predict.shape[0], x_predict.shape[1], x_train.shape[0]
-    dkxX_dx = np.empty((q, n, d))
-    dkxx_dx = np.empty((q, q, d))
+    # Tensor for the gradients of the (q, n) covariance matrix between x_predict and x_train with respect to
+    # x_predict (of shape (q, d))
+    dkxX_dx = np.zeros((d, q*q, n))
+    # Tensor for the gradients of the full covariance matrix at points x_predict (of shape (q, q)) with respect to
+    # x_predict (of shape (q, d))
+    dkxx_dx = np.zeros((d, q*q, q))
Review comment: Naming here is a bit unfortunate. I know it was already there, but do you have any ideas for improving it? The only way to tell the two variables apart is the upper/lower-case X, and that is very hard to read.

Reply: I renamed it to something more verbose, let me know if you prefer it!

Reply: I tried to improve it, let me know what you think!
     for i in range(d):
-        dkxX_dx[:, :, i] = kern.dK_dX(x_predict, x_train, i)
-        dkxx_dx[:, :, i] = kern.dK_dX(x_predict, x_predict, i)
+        dkxX_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_train, i)
+        dkxx_dx[i, ::q + 1, :] = kern.dK_dX(x_predict, x_predict, i)
+    dkxX_dx = dkxX_dx.reshape((d, q, q, n))
+    dkxx_dx = dkxx_dx.reshape((d, q, q, q))
+    dkxx_dx += dkxx_dx.transpose((0, 1, 3, 2))
+    dkxx_dx.reshape((d, q, -1))[:, :, ::q + 1] = 0.
 
     K = kern.K(x_predict, x_train)
 
-    dsigma = np.zeros((q, q, q, d))
-    for i in range(q):
-        for j in range(d):
-            Ks = np.zeros((q, n))
-            Ks[i, :] = dkxX_dx[i, :, j]
-            dKss_dxi = np.zeros((q, q))
-            dKss_dxi[i, :] = dkxx_dx[i, :, j]
-            dKss_dxi[:, i] = dkxx_dx[i, :, j].T
-            dKss_dxi[i, i] = 0
-            dsigma[:, :, i, j] = dKss_dxi - Ks @ w_inv @ K.T - K @ w_inv @ Ks.T
-    return dsigma
+    dsigma = dkxx_dx - K @ w_inv @ dkxX_dx.transpose((0, 1, 3, 2)) - dkxX_dx @ w_inv @ K.T
+    return dsigma.transpose((2, 3, 1, 0))
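A note on the indexing trick used in the rewritten loops: writing to rows `::q + 1` of a `(q*q, m)` buffer places row k of `kern.dK_dX(...)` at block position `(k, k)` once the buffer is reshaped to `(q, q, m)`, which is what lets the final result be computed with batched matrix products instead of the removed nested loops. A tiny self-contained sketch of just that mechanic (the values are arbitrary, for illustration only):

```python
import numpy as np

q, m = 3, 4
buf = np.zeros((q * q, m))

# Rows 0, q+1, 2(q+1), ... of the flat buffer become the diagonal
# blocks [k, k, :] once it is reshaped to (q, q, m).
buf[::q + 1, :] = np.arange(1, q + 1)[:, None] * np.ones((q, m))
cube = buf.reshape((q, q, m))
assert all((cube[k, k] == k + 1).all() for k in range(q))
```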
@@ -210,6 +222,34 @@ def dmean(x_predict: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_vec: np.ndarray) -> np.ndarray:
             dmu[j, j, i] = (dkxX_dx[j, :, i][None, :] @ w_vec[:, None]).flatten()
     return dmu
 
+
+def dCov(x1: np.ndarray, x2: np.ndarray, x_train: np.ndarray, kern: GPy.kern, w_inv: np.ndarray) -> np.ndarray:
+    """
+    Compute the derivative of the posterior covariance matrix between prediction inputs x1 and x2
+    (of shape (q1, q2)) with respect to x1.
+
+    :param x1: Prediction inputs of shape (q1, d)
+    :param x2: Prediction inputs of shape (q2, d)
+    :param x_train: Training inputs of shape (n, d)
+    :param kern: Covariance of the GP model
+    :param w_inv: Woodbury inverse of the posterior fit of the GP
+    :return: nd array of shape (q1, q2, q1, d) representing the gradient of the posterior covariance between x1 and x2,
+             where res[:, :, i, j] is the gradient of the covariance between outputs at x1 and x2 with respect to x1[i, j]
+    """
+    q1, q2, d, n = x1.shape[0], x2.shape[0], x1.shape[1], x_train.shape[0]
+    dkx1X_dx = np.zeros((d, q1*q1, n))
+    dkx1x2_dx = np.zeros((d, q1*q1, q2))
+    for i in range(d):
+        dkx1X_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x_train, i)
+        dkx1x2_dx[i, ::q1 + 1, :] = kern.dK_dX(x1, x2, i)
+    dkx1X_dx = dkx1X_dx.reshape((d, q1, q1, n))
+    dkx1x2_dx = dkx1x2_dx.reshape((d, q1, q1, q2))
Review comment (on the variable names above): Naming here is hard to get through.
+
+    K_Xx2 = kern.K(x_train, x2)
+    dcov = dkx1x2_dx - dkx1X_dx @ w_inv @ K_Xx2
+    return dcov.transpose((2, 3, 1, 0))
+
+
 class GPyMultiOutputWrapper(IModel, IDifferentiable, ICalculateVarianceReduction, IEntropySearchModel):
     """
     A wrapper around GPy multi-output models.
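For reference when checking the algebra in dCov: the posterior covariance between prediction inputs is

$$\mathrm{Cov}(x_1, x_2) = k(x_1, x_2) - k(x_1, X)\, W^{-1}\, k(X, x_2),$$

where $X$ are the training inputs and $W^{-1}$ is the Woodbury inverse. Since only $x_1$ is differentiated, the gradient is

$$\frac{\partial\, \mathrm{Cov}(x_1, x_2)}{\partial x_1} = \frac{\partial k(x_1, x_2)}{\partial x_1} - \frac{\partial k(x_1, X)}{\partial x_1}\, W^{-1}\, k(X, x_2),$$

which is exactly the two-term expression assigned to dcov. Unlike dSigma, no symmetrization terms appear, because $x_2$ is held fixed.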
New test file:
@@ -0,0 +1,53 @@
+import GPy
+import numpy as np
+import pytest
+
+from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper
+
+
+@pytest.fixture
+def test_data(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(5, gpy_model.X.shape[1])
+
+
+@pytest.fixture
+def test_data2(gpy_model):
+    np.random.seed(42)
+    return np.random.randn(4, gpy_model.X.shape[1])
+
+
+def test_joint_prediction_gradients(gpy_model, test_data):
+    epsilon = 1e-5
+    mean, cov = gpy_model.predict_with_full_covariance(test_data)
+    # Get the gradients
+    mean_dx, cov_dx = gpy_model.get_joint_prediction_gradients(test_data)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            mean_perturbed, cov_perturbed = gpy_model.predict_with_full_covariance(perturbed_input)
+            mean_dx_numerical = (mean_perturbed - mean) / epsilon
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that the numerical approximation is similar to the true gradient
+            assert pytest.approx(mean_dx_numerical.ravel(), abs=1e-8, rel=1e-2) == mean_dx[:, i, j]
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
+
+
+def test_get_covariance_between_points_gradients(gpy_model, test_data, test_data2):
+    epsilon = 1e-5
+    cov = gpy_model.get_covariance_between_points(test_data, test_data2)
+    # Get the gradients
+    cov_dx = gpy_model.get_covariance_between_points_gradients(test_data, test_data2)
+
+    for i in range(test_data.shape[0]):  # Iterate over each test point
+        for j in range(test_data.shape[1]):  # Iterate over each dimension
+            # Approximate the gradient numerically
+            perturbed_input = test_data.copy()
+            perturbed_input[i, j] += epsilon
+            cov_perturbed = gpy_model.get_covariance_between_points(perturbed_input, test_data2)
+            cov_dx_numerical = (cov_perturbed - cov) / epsilon
+            # Check that the numerical approximation is similar to the true gradient
+            assert pytest.approx(cov_dx_numerical, abs=1e-8, rel=1e-2) == cov_dx[:, :, i, j]
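Note for anyone running these tests standalone: the `gpy_model` fixture is not part of this diff (it presumably comes from emukit's shared test fixtures). A minimal stand-in might look like the following sketch; the RBF kernel and data sizes are illustrative assumptions:

```python
import GPy
import numpy as np
import pytest

from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper


@pytest.fixture
def gpy_model():
    # Illustrative stand-in fixture: a small wrapped GP regression model.
    np.random.seed(0)
    X = np.random.randn(10, 2)
    Y = np.random.randn(10, 1)
    return GPyModelWrapper(GPy.models.GPRegression(X, Y, GPy.kern.RBF(2)))
```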
Review comment (on get_covariance_between_points_gradients): Three inputs come directly from the model. I wonder if it makes sense to just pass the model in, or to not extract dCov as a stateless function at all. Consider this: all this method does is call dCov, literally nothing else, and so far dCov is only used here. It seems too specific to get reused. Is that impression correct?

Reply: I think that's fair to say; I was primarily just trying to mirror the implementation of dSigma that was already in the module. Happy to not separate it out.