From 3a169917326bdacef9bb7003a8122766d7cf6472 Mon Sep 17 00:00:00 2001
From: Hugh Salimbeni
Date: Tue, 19 Jun 2018 16:00:11 +0100
Subject: [PATCH] Removed jitter in nat grads (#768)

---
 gpflow/training/natgrad_optimizer.py | 15 ++-------------
 tests/test_optimizers.py             | 27 ++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/gpflow/training/natgrad_optimizer.py b/gpflow/training/natgrad_optimizer.py
index bab7ab9fc..4bbfe7f61 100644
--- a/gpflow/training/natgrad_optimizer.py
+++ b/gpflow/training/natgrad_optimizer.py
@@ -347,7 +347,7 @@ def natural_to_meanvarsqrt(nat_1, nat_2):
     mu = tf.matmul(S, nat_1)
     # We need the decomposition of S as L L^T, not as L^T L,
     # hence we need another cholesky.
-    return mu, _cholesky_with_jitter(S)
+    return mu, tf.cholesky(S)


 @swap_dimensions
@@ -370,7 +370,7 @@ def expectation_to_natural(eta_1, eta_2):
 @swap_dimensions
 def expectation_to_meanvarsqrt(eta_1, eta_2):
     var = eta_2 - tf.matmul(eta_1, eta_1, transpose_b=True)
-    return eta_1, _cholesky_with_jitter(var)
+    return eta_1, tf.cholesky(var)


 @swap_dimensions
@@ -378,17 +378,6 @@ def meanvarsqrt_to_expectation(m, v_sqrt):
     v = tf.matmul(v_sqrt, v_sqrt, transpose_b=True)
     return m, v + tf.matmul(m, m, transpose_b=True)

-
-def _cholesky_with_jitter(M):
-    """
-    Add jitter and take Cholesky
-
-    :param M: Tensor of shape NxNx...N
-    :return: The Cholesky decomposition of the input `M`. It's a `tf.Tensor` of shape ...xNxN
-    """
-    N = tf.shape(M)[-1]
-    return tf.cholesky(M + settings.jitter * tf.eye(N, dtype=M.dtype))
-
 def _inverse_lower_triangular(M):
     """
     Take inverse of lower triangular (e.g. Cholesky) matrix. This function
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index 9ebbdd893..8a7161564 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -245,6 +245,31 @@ def test_scipy_optimizer_options(session_tf):
     assert o1.optimizer.optimizer_kwargs['options'][gtol] == gtol_value
     assert gtol not in o2.optimizer.optimizer_kwargs['options']

+def test_small_q_sqrt_handeled_correctly(session_tf):
+    """
+    This is an extra test to make sure things still work when q_sqrt is small. This was breaking (#767)
+    """
+    N, D = 3, 2
+    X = np.random.randn(N, D)
+    Y = np.random.randn(N, 1)
+    kern = gpflow.kernels.RBF(D)
+    lik_var = 0.1
+    lik = gpflow.likelihoods.Gaussian()
+    lik.variance = lik_var
+
+    m_vgp = gpflow.models.VGP(X, Y, kern, lik)
+    m_gpr = gpflow.models.GPR(X, Y, kern)
+    m_gpr.likelihood.variance = lik_var
+
+    m_vgp.set_trainable(False)
+    m_vgp.q_mu.set_trainable(True)
+    m_vgp.q_sqrt.set_trainable(True)
+    m_vgp.q_mu = np.random.randn(N, 1)
+    m_vgp.q_sqrt = np.eye(N)[None, :, :] * 1e-3
+    NatGradOptimizer(1.).minimize(m_vgp, [(m_vgp.q_mu, m_vgp.q_sqrt)], maxiter=1)
+
+    assert_allclose(m_gpr.compute_log_likelihood(),
+                    m_vgp.compute_log_likelihood(), atol=1e-4)

 def test_VGP_vs_GPR(session_tf):
     """
@@ -269,7 +294,7 @@ def test_VGP_vs_GPR(session_tf):
     NatGradOptimizer(1.).minimize(m_vgp, [(m_vgp.q_mu, m_vgp.q_sqrt)], maxiter=1)

     assert_allclose(m_gpr.compute_log_likelihood(),
-                    m_vgp.compute_log_likelihood(), atol=1e-5)
+                    m_vgp.compute_log_likelihood(), atol=1e-4)


 def test_other_XiTransform_VGP_vs_GPR(session_tf, xi_transform=XiSqrtMeanVar()):
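
For context, the two conversions touched above recover the q_sqrt factor by taking a Cholesky of the implied covariance, and adding jitter at that point inflates a q_sqrt as small as the one exercised by the new test. Below is a minimal NumPy sketch of that effect; it is illustrative only, not GPflow code, and it assumes a jitter level of 1e-6 purely for the sake of the example.

    # Illustrative sketch (not part of the patch): effect of jitter on a small q_sqrt.
    import numpy as np

    N = 3
    q_sqrt = np.eye(N) * 1e-3          # small q_sqrt, as in the new test
    S = q_sqrt @ q_sqrt.T              # implied covariance: 1e-6 on the diagonal

    jitter = 1e-6                      # assumed jitter level, for illustration only
    with_jitter = np.linalg.cholesky(S + jitter * np.eye(N))
    without_jitter = np.linalg.cholesky(S)

    print(np.diag(with_jitter))        # ~1.41e-3: recovered scale is ~40% too large
    print(np.diag(without_jitter))     # 1.0e-3: matches the original q_sqrt

With the jitter removed, the round trip through the natural/expectation parameterisations preserves a small q_sqrt, which is what the new test and the tightened tolerance check.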