diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
new file mode 100644
index 00000000..323237ba
--- /dev/null
+++ b/.JuliaFormatter.toml
@@ -0,0 +1 @@
+style = "blue"
diff --git a/Project.toml b/Project.toml
new file mode 100644
index 00000000..de63578f
--- /dev/null
+++ b/Project.toml
@@ -0,0 +1,17 @@
+name = "SparseGPs"
+uuid = "298c2ebc-0411-48ad-af38-99e88101b606"
+authors = ["Ross Viljoen "]
+version = "0.1.0"
+
+[deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40"
+KLDivergences = "3c9cd921-3d3f-41e2-830c-e020174918cc"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git a/examples/classification.jl b/examples/classification.jl
new file mode 100644
index 00000000..b1b7f6fc
--- /dev/null
+++ b/examples/classification.jl
@@ -0,0 +1,138 @@
+# Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html
+
+# %%
+using SparseGPs
+using AbstractGPs
+using GPLikelihoods
+using StatsFuns
+using FastGaussQuadrature
+using Distributions
+using LinearAlgebra
+using DelimitedFiles
+using IterTools
+
+using Plots
+default(; legend=:outertopright, size=(700, 400))
+
+using Random
+Random.seed!(1234)
+
+# %%
+# Read in the classification data
+data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv"
+x, y = eachcol(readdlm(data_file))
+scatter(x, y)
+
+# %%
+# First, create the GP kernel from given parameters k
+function make_kernel(k)
+    return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+end
+
+k = [10, 0.1]
+
+kernel = make_kernel(k)
+f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1)
+fx = f(x)
+
+# %%
+# Then, plot some samples from the prior underlying GP
+x_plot = 0:0.02:6
+prior_f_samples = rand(f.f(x_plot, 1e-6), 20)
+
+plt = plot(x_plot, prior_f_samples; seriescolor="red", linealpha=0.2, label="")
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+
+# %%
+# Plot the same samples, but pushed through a logistic sigmoid to constrain
+# them in (0, 1).
+prior_y_samples = mean.(f.lik.(prior_f_samples))
+
+plt = plot(x_plot, prior_y_samples; seriescolor="red", linealpha=0.2, label="")
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+
+# %%
+# A simple Flux model
+using Flux
+
+struct SVGPModel
+    k # kernel parameters
+    m # variational mean
+    A # variational covariance
+    z # inducing points
+end
+
+Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+lik = BernoulliLikelihood()
+jitter = 1e-4
+
+function (m::SVGPModel)(x)
+    kernel = make_kernel(m.k)
+    f = LatentGP(GP(kernel), lik, jitter)
+    q = MvNormal(m.m, m.A'm.A)
+    fx = f(x)
+    fu = f(m.z).fx
+    return fx, fu, q
+end
+
+function flux_loss(x, y; n_data=length(y))
+    fx, fu, q = model(x)
+    return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo())
+end
+
+# %%
+M = 15 # number of inducing points
+
+# Initialise the parameters
+k = [10, 0.1]
+m = zeros(M)
+A = Matrix{Float64}(I, M, M)
+z = x[1:M]
+
+model = SVGPModel(k, m, A, z)
+
+opt = ADAM(0.1)
+parameters = Flux.params(model)
+
+# %%
+# Negative ELBO before training
+println(flux_loss(x, y))
+
+# %%
+# Train the model
+Flux.train!(
+    (x, y) -> flux_loss(x, y),
+    parameters,
+    ncycle([(x, y)], 2000), # Train for 2000 epochs
+    opt,
+)
+
+# %%
+# Negative ELBO after training
+println(flux_loss(x, y))
+
+# %%
+# After optimisation, plot samples from the underlying posterior GP.
+fu = f(z).fx # want the underlying FiniteGP
+post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A))
+l_post = LatentGP(post, BernoulliLikelihood(), jitter)
+
+post_f_samples = rand(l_post.f(x_plot, 1e-6), 20)
+
+plt = plot(x_plot, post_f_samples; seriescolor="red", linealpha=0.2, legend=false)
+
+# %%
+# As above, push these samples through a logistic sigmoid to get posterior predictions.
+post_y_samples = mean.(l_post.lik.(post_f_samples))
+
+plt = plot(
+    x_plot,
+    post_y_samples;
+    seriescolor="red",
+    linealpha=0.2,
+    # legend=false,
+    label="",
+)
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+vline!(z; label="Pseudo-points")
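As a rough complement to the sample plots in the classification example (an illustrative sketch, not part of this patch; the variable names `f_samples`, `p_samples` and `p_mean` are arbitrary, and it reuses the `l_post`, `x_plot` and `plt` objects defined above), the mean posterior class probability p(y = 1 | x) could be estimated by averaging the Bernoulli means over many posterior function samples:

    f_samples = rand(l_post.f(x_plot, 1e-6), 1000)  # many posterior samples of the latent f
    p_samples = mean.(l_post.lik.(f_samples))       # logistic(f) for each sample
    p_mean = vec(mean(p_samples; dims=2))           # Monte Carlo estimate of p(y = 1 | x)
    plot!(plt, x_plot, p_mean; seriescolor="green", linewidth=2, label="Mean prediction")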
diff --git a/examples/data/classif_1D.csv b/examples/data/classif_1D.csv
new file mode 100644
index 00000000..70ddb862
--- /dev/null
+++ b/examples/data/classif_1D.csv
@@ -0,0 +1,50 @@
+5.668341708542713242e+00 0.000000000000000000e+00
+5.758793969849246075e+00 0.000000000000000000e+00
+5.517587939698492150e+00 0.000000000000000000e+00
+2.954773869346733584e+00 1.000000000000000000e+00
+3.648241206030150785e+00 1.000000000000000000e+00
+2.110552763819095290e+00 1.000000000000000000e+00
+4.613065326633165597e+00 0.000000000000000000e+00
+4.793969849246231263e+00 0.000000000000000000e+00
+4.703517587939698430e+00 0.000000000000000000e+00
+6.030150753768843686e-01 1.000000000000000000e+00
+3.015075376884421843e-01 0.000000000000000000e+00
+3.979899497487437099e+00 0.000000000000000000e+00
+3.226130653266331638e+00 1.000000000000000000e+00
+1.899497487437185939e+00 1.000000000000000000e+00
+1.145728643216080256e+00 1.000000000000000000e+00
+3.316582914572864249e-01 0.000000000000000000e+00
+6.030150753768843686e-01 1.000000000000000000e+00
+2.231155778894472252e+00 1.000000000000000000e+00
+3.256281407035175768e+00 1.000000000000000000e+00
+1.085427135678391997e+00 1.000000000000000000e+00
+1.809045226130653106e+00 1.000000000000000000e+00
+4.492462311557789079e+00 0.000000000000000000e+00
+1.959798994974874198e+00 1.000000000000000000e+00
+0.000000000000000000e+00 0.000000000000000000e+00
+3.346733668341708601e+00 1.000000000000000000e+00
+1.507537688442210921e-01 0.000000000000000000e+00
+1.809045226130653328e-01 1.000000000000000000e+00
+5.517587939698492150e+00 0.000000000000000000e+00
+2.201005025125628123e+00 1.000000000000000000e+00
+5.577889447236180409e+00 0.000000000000000000e+00
+1.809045226130653328e-01 0.000000000000000000e+00
+1.688442211055276365e+00 1.000000000000000000e+00
+4.160804020100502321e+00 0.000000000000000000e+00
+2.170854271356783993e+00 1.000000000000000000e+00
+4.311557788944723413e+00 0.000000000000000000e+00
+3.075376884422110546e+00 1.000000000000000000e+00
+5.125628140703517133e+00 0.000000000000000000e+00
+1.989949748743718549e+00 1.000000000000000000e+00
+5.366834170854271058e+00 0.000000000000000000e+00
+4.100502512562814061e+00 0.000000000000000000e+00
+7.236180904522613311e-01 1.000000000000000000e+00
+2.261306532663316382e+00 1.000000000000000000e+00
+3.467336683417085119e+00 1.000000000000000000e+00
+1.085427135678391997e+00 1.000000000000000000e+00
+5.095477386934673447e+00 0.000000000000000000e+00
+5.185929648241205392e+00 0.000000000000000000e+00
+2.743718592964823788e+00 1.000000000000000000e+00
+2.773869346733668362e+00 1.000000000000000000e+00
+1.417085427135678311e+00 1.000000000000000000e+00
+1.989949748743718549e+00 1.000000000000000000e+00
diff --git a/examples/regression.jl b/examples/regression.jl
new file mode 100644
index 00000000..518ee761
--- /dev/null
+++ b/examples/regression.jl
@@ -0,0 +1,174 @@
+# A recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html
+
+using AbstractGPs
+using SparseGPs
+using Distributions
+using LinearAlgebra
+using Optim
+using IterTools
+
+using Plots
+default(; legend=:outertopright, size=(700, 400))
+
+using Random
+Random.seed!(1234)
+
+# %%
+# The data generating function
+function g(x)
+    return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x)
+end
+
+N = 10000 # Number of training points
+x = rand(Uniform(-1, 1), N)
+y = g.(x) + 0.3 * randn(N)
+
+scatter(x, y; xlabel="x", ylabel="y", legend=false)
+
+# %%
+# A simple Flux model
+using Flux
+
+lik_noise = 0.3
+jitter = 1e-5
+
+struct SVGPModel
+    k # kernel parameters
+    m # variational mean
+    A # variational covariance
+    z # inducing points
+end
+
+Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+function make_kernel(k)
+    return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+end
+
+# Create the 'model' from the parameters - i.e. return the FiniteGP at inputs x,
+# the FiniteGP at inducing inputs z and the variational posterior over inducing
+# points - q(u).
+function (m::SVGPModel)(x)
+    kernel = make_kernel(m.k)
+    f = GP(kernel)
+    q = MvNormal(m.m, m.A'm.A)
+    fx = f(x, lik_noise)
+    fu = f(m.z, jitter)
+    return fx, fu, q
+end
+
+# Create the posterior GP from the model parameters.
+function posterior(m::SVGPModel)
+    kernel = make_kernel(m.k)
+    f = GP(kernel)
+    fu = f(m.z, jitter)
+    q = MvNormal(m.m, m.A'm.A)
+    return SparseGPs.approx_posterior(SVGP(), fu, q)
+end
+
+# Return the loss given data - in this case the negative ELBO.
+function flux_loss(x, y; n_data=length(y))
+    fx, fu, q = model(x)
+    return -SparseGPs.elbo(fx, y, fu, q; n_data)
+end
+
+# %%
+M = 50 # number of inducing points
+
+# Select the first M inputs as inducing inputs
+z = x[1:M]
+
+# Initialise the parameters
+k = [0.3, 10]
+m = zeros(M)
+A = Matrix{Float64}(I, M, M)
+
+model = SVGPModel(k, m, A, z)
+
+b = 100 # minibatch size
+opt = ADAM(0.001)
+parameters = Flux.params(model)
+data_loader = Flux.Data.DataLoader((x, y); batchsize=b)
+
+# %%
+# Negative ELBO before training
+println(flux_loss(x, y))
+
+# %%
+# Train the model
+Flux.train!(
+    (x, y) -> flux_loss(x, y; n_data=N),
+    parameters,
+    ncycle(data_loader, 300), # Train for 300 epochs
+    opt,
+)
+
+# %%
+# Negative ELBO after training
+println(flux_loss(x, y))
+
+# %%
+# Plot samples from the optimised approximate posterior.
+post = posterior(model)
+
+scatter(
+    x,
+    y;
+    markershape=:xcross,
+    markeralpha=0.1,
+    xlim=(-1, 1),
+    xlabel="x",
+    ylabel="y",
+    title="posterior (VI with sparse grid)",
+    label="Train Data",
+)
+plot!(-1:0.001:1, post; label="Posterior")
+vline!(z; label="Pseudo-points")
+
+# %% There is a closed form optimal solution for the variational posterior q(u)
+# (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+# equations (11) & (12)). The SVGP posterior with this optimal q(u) should
+# therefore be equivalent to the 'exact' sparse GP (Titsias) posterior.
+
+function exact_q(fu, fx, y)
+    σ² = fx.Σy[1]
+    Kuf = cov(fu, fx)
+    Kuu = Symmetric(cov(fu))
+    Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf'))
+    m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y
+    S = Symmetric(Kuu * (Σ \ Kuu))
+    return MvNormal(m, S)
+end
+
+kernel = make_kernel([0.3, 10])
+f = GP(kernel)
+fx = f(x, lik_noise)
+fu = f(z, jitter)
+q_ex = exact_q(fu, fx, y)
+
+scatter(x, y)
+scatter!(z, q_ex.μ)
+
+# These two should be the same - and they are, as the plot below shows
+ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior
+ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior
+
+# These are also approximately equal
+SparseGPs.elbo(fx, y, fu, q_ex)
+AbstractGPs.elbo(fx, y, fu)
+
+# %%
+scatter(
+    x,
+    y;
+    markershape=:xcross,
+    markeralpha=0.1,
+    xlim=(-1, 1),
+    xlabel="x",
+    ylabel="y",
+    title="posterior (VI with sparse grid)",
+    label="Train Data",
+)
+plot!(-1:0.001:1, ap_ex; label="SVGP posterior")
+plot!(-1:0.001:1, ap_tits; label="Titsias posterior")
+vline!(z; label="Pseudo-points")
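For reference, the closed-form optimum computed by `exact_q` above (equations (11) and (12) in the linked derivation) can be written, for Gaussian observation noise σ², as

```math
\Sigma = K_{uu} + \sigma^{-2} K_{uf} K_{uf}^\top, \quad
m^* = \sigma^{-2} K_{uu} \Sigma^{-1} K_{uf} y, \quad
S^* = K_{uu} \Sigma^{-1} K_{uu},
```

so that the optimal variational distribution is q(u) = N(m*, S*), which is exactly what the function returns.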
diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl
new file mode 100644
index 00000000..c0a165af
--- /dev/null
+++ b/src/SparseGPs.jl
@@ -0,0 +1,23 @@
+module SparseGPs
+
+using AbstractGPs
+using Distributions
+using LinearAlgebra
+using Statistics
+using StatsBase
+using FastGaussQuadrature
+using GPLikelihoods
+using SpecialFunctions
+using ChainRulesCore
+using FillArrays
+using KLDivergences
+
+using AbstractGPs:
+    AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A
+
+export SVGP, Default, Analytic, Quadrature, MonteCarlo
+
+include("elbo.jl")
+include("svgp.jl")
+
+end
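The next file, src/elbo.jl, implements the bound itself. For reference, the quantity assembled by the `_elbo` helper below is the uncollapsed ELBO of Hensman et al. (2015),

```math
\mathcal{L} = \frac{N}{B} \sum_{i=1}^{B} \mathbb{E}_{q(f_i)}\big[\log p(y_i \mid f_i)\big]
    - \mathrm{KL}\big(q(u) \,\|\, p(u)\big),
```

where B is the minibatch size, N is the `n_data` keyword argument, and the marginals q(fᵢ) are those returned by `marginals(post(fx.x))`.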
diff --git a/src/elbo.jl b/src/elbo.jl
new file mode 100644
index 00000000..38627ceb
--- /dev/null
+++ b/src/elbo.jl
@@ -0,0 +1,226 @@
+"Likelihoods which take a scalar as input and return a scalar."
+ScalarLikelihood = Union{
+    BernoulliLikelihood,
+    PoissonLikelihood,
+    GaussianLikelihood,
+    ExponentialLikelihood,
+    GammaLikelihood,
+}
+
+abstract type ExpectationMethod end
+struct Default <: ExpectationMethod end
+struct Analytic <: ExpectationMethod end
+
+struct Quadrature <: ExpectationMethod
+    n_points::Int
+end
+Quadrature() = Quadrature(20)
+
+struct MonteCarlo <: ExpectationMethod
+    n_samples::Int
+end
+MonteCarlo() = MonteCarlo(20)
+
+"""
+    elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default())
+
+Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are
+observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a
+variational distribution over inducing points `u = f(z)`.
+
+`method` selects which method is used to calculate the expected log likelihood
+in the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and
+`MonteCarlo()`. For likelihoods with an analytic solution, `Default()` uses this
+exact solution. If there is no such solution, `Default()` either uses
+`Quadrature()` or `MonteCarlo()`, depending on the likelihood.
+
+N.B. the likelihood is assumed to be Gaussian with observation noise `fx.Σy`.
+Further, `fx.Σy` must be homoscedastic and uncorrelated - i.e. `fx.Σy = α * I`.
+
+[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable
+variational Gaussian process classification." Artificial Intelligence and
+Statistics. PMLR, 2015.
+"""
+function AbstractGPs.elbo(
+    fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}},
+    y::AbstractVector{<:Real},
+    fz::FiniteGP,
+    q::AbstractMvNormal;
+    n_data=length(y),
+    method=Default(),
+)
+    return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data)
+end
+
+function AbstractGPs.elbo(
+    ::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...
+)
+    return error(
+        "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)",
+    )
+end
+
+"""
+    elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default())
+
+Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood.
+"""
+function AbstractGPs.elbo(
+    lfx::LatentFiniteGP,
+    y::AbstractVector,
+    fz::FiniteGP,
+    q::AbstractMvNormal;
+    n_data=length(y),
+    method=Default(),
+)
+    return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data)
+end
+
+# Compute the common elements of the ELBO
+function _elbo(
+    method::ExpectationMethod,
+    fx::FiniteGP,
+    y::AbstractVector,
+    fz::FiniteGP,
+    q::AbstractMvNormal,
+    lik::ScalarLikelihood,
+    n_data::Integer,
+)
+    post = approx_posterior(SVGP(), fz, q)
+    q_f = marginals(post(fx.x))
+    variational_exp = expected_loglik(method, y, q_f, lik)
+
+    kl_term = kldivergence(q, fz)
+
+    n_batch = length(y)
+    scale = n_data / n_batch
+    return sum(variational_exp) * scale - kl_term
+end
+
+"""
+    expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik)
+
+This function computes the expected log likelihood:
+
+```math
+    ∫ q(f) log p(y | f) df
+```
+where `p(y | f)` is the process likelihood.
+
+`q(f)` is an approximation to the latent function values `f` given by:
+```math
+    q(f) = ∫ p(f | u) q(u) du
+```
+where `q(u)` is the variational distribution over inducing points (see
+[`elbo`](@ref)). The marginal distributions of `q(f)` are given by `q_f`.
+
+`method` determines which method is used to calculate the expected log
+likelihood - see [`elbo`](@ref) for more details.
+
+# Extended help
+
+`q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to
+have independent marginals such that only the marginals of `q(f)` are required.
+"""
+expected_loglik(method, y, q_f, lik)
+
+"""
+    expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood)
+
+The expected log likelihood for a `ScalarLikelihood`, computed via `method`.
+Defaults to a closed form solution if it exists, otherwise defaults to
+Gauss-Hermite quadrature.
+"""
+function expected_loglik(
+    ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    method = _default_method(lik)
+    return expected_loglik(method, y, q_f, lik)
+end
+
+# The closed form solution for independent Gaussian noise
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    lik::GaussianLikelihood,
+)
+    return sum(
+        -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²)
+    )
+end
+
+# The closed form solution for a Poisson likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector,
+    q_f::AbstractVector{<:Normal},
+    ::PoissonLikelihood{ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1))
+end
+
+# The closed form solution for an Exponential likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    ::ExponentialLikelihood{ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ))
+end
+
+# The closed form solution for a Gamma likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    lik::GammaLikelihood{<:Any,ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum(
+        (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .-
+        loggamma(lik.α),
+    )
+end
+
+function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik)
+    return error(
+        "No analytic solution exists for ",
+        typeof(lik),
+        ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.",
+    )
+end
+
+function expected_loglik(
+    mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    # take 'n_samples' reparameterised samples
+    f_μ = mean.(q_f)
+    fs = f_μ .+ std.(q_f) .* randn(eltype(f_μ), length(q_f), mc.n_samples)
+    lls = loglikelihood.(lik.(fs), y)
+    return sum(lls) / mc.n_samples
+end
+
+function expected_loglik(
+    gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    # Compute the expectation via Gauss-Hermite quadrature
+    # using a reparameterisation by change of variable
+    # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)
+    xs, ws = gausshermite(gh.n_points)
+    # size(fs): (length(y), n_points)
+    fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f)
+    lls = loglikelihood.(lik.(fs), y)
+    return sum((1 / √π) * lls * ws)
+end
+
+ChainRulesCore.@non_differentiable gausshermite(n)
+
+AnalyticLikelihood = Union{
+    PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood
+}
+_default_method(::AnalyticLikelihood) = Analytic()
+_default_method(_) = Quadrature()
diff --git a/src/svgp.jl b/src/svgp.jl
new file mode 100644
index 00000000..0e71a265
--- /dev/null
+++ b/src/svgp.jl
@@ -0,0 +1,71 @@
+struct SVGP end
+
+raw"""
+    approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal)
+
+Compute the approximate posterior [1] over the process `f = fz.f`, given inducing
+inputs `z = fz.x` and a variational distribution over inducing points `q(u)` where `u =
+f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)``
+is then given by:
+
+```math
+q(f^*) = \int p(f | u) q(u) du
+```
+which can be found in closed form.
+
+[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable
+variational Gaussian process classification." Artificial Intelligence and
+Statistics. PMLR, 2015.
+"""
+function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal)
+    m, A = mean(q), _chol_cov(q)
+    Kuu = _chol_cov(fz)
+    B = Kuu.L \ A.L
+    α = Kuu \ (m - mean(fz))
+    data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x)
+    return ApproxPosteriorGP(SVGP(), fz.f, data)
+end
+
+function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    return mean(f.prior, x) + cov(f.prior, x, f.data.u) * f.data.α
+end
+
+function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D)
+end
+
+function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D)
+end
+
+function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector)
+    B = f.data.B
+    Cxu = cov(f.prior, x, f.data.u)
+    Cuy = cov(f.prior, f.data.u, y)
+    D = f.data.Kuu.L \ Cuy
+    E = Cxu / f.data.Kuu.L'
+    return cov(f.prior, x, y) - (E * D) + (E * B * B' * D)
+end
+
+function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    μ = Cux' * f.data.α
+    Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D)
+    return μ, Σ
+end
+
+function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    μ = Cux' * f.data.α
+    Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D)
+    return μ, Σ_diag
+end
+
+_chol_cov(q::AbstractMvNormal) = cholesky(Symmetric(cov(q)))
+_chol_cov(q::MvNormal) = cholesky(q.Σ)
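For reference, the closed form evaluated by the `mean`, `cov` and `var` methods above is, for q(u) = N(m, S) and test inputs x*,

```math
q(f^*) = \mathcal{N}\Big(\mu_* + K_{*u} K_{uu}^{-1} (m - \mu_u),\;
    K_{**} - K_{*u} K_{uu}^{-1} K_{u*} + K_{*u} K_{uu}^{-1} S K_{uu}^{-1} K_{u*}\Big),
```

with the Cholesky factor of `Kuu`, `α = Kuu \ (m - mean(fz))` and `B = Kuu.L \ A.L` cached in `data`, so that the quadratic forms reduce to the `At_A`/`diag_At_A` calls in the code.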
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 00000000..47a7de77
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,8 @@
+[deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/elbo.jl b/test/elbo.jl
new file mode 100644
index 00000000..0f20367c
--- /dev/null
+++ b/test/elbo.jl
@@ -0,0 +1,36 @@
+@testset "elbo" begin
+    rng, N = MersenneTwister(654321), 20
+    x = rand(rng, N) * 10
+    y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N)
+    z = x[begin:5]
+
+    kernel = make_kernel([0.2, 0.6])
+    f = GP(kernel)
+    fx = f(x, 0.1)
+    fz = f(z)
+    q_ex = exact_variational_posterior(fz, fx, y)
+
+    @test elbo(fx, y, fz, q_ex) isa Real
+    @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y)
+
+    fx_bad = f(x, fill(0.1, N))
+    @test_throws ErrorException elbo(fx_bad, y, fz, q_ex)
+
+    # Test that the various methods of computing expectations return the same
+    # result.
+    rng = MersenneTwister(123456)
+    q_f = Normal.(zeros(10), ones(10))
+
+    @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood)
+        l = lik()
+        methods = [Quadrature(100), MonteCarlo(1000000)]
+        def = SparseGPs._default_method(l)
+        if def isa Analytic
+            push!(methods, def)
+        end
+        y = rand.(rng, l.(zeros(10)))
+
+        results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods)
+        @test all(x -> isapprox(x, results[end]; rtol=1e-3), results)
+    end
+end
diff --git a/test/equivalences.jl b/test/equivalences.jl
new file mode 100644
index 00000000..3bb4c7bc
--- /dev/null
+++ b/test/equivalences.jl
@@ -0,0 +1,117 @@
+@testset "equivalences" begin
+    rng, N = MersenneTwister(654321), 20
+    x = rand(rng, N) * 10
+    y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N)
+
+    z = copy(x) # Set inducing inputs == training inputs
+
+    k_init = [0.2, 0.6] # initial kernel parameters
+    lik_noise = 0.1 # The (fixed) Gaussian likelihood noise
+
+    @testset "exact posterior" begin
+        # There is a closed form optimal solution for the variational posterior
+        # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+        # equations (11) & (12)). The SVGP posterior with this optimal q(u)
+        # should therefore be equivalent to the sparse GP (Titsias) posterior
+        # and exact GP regression (when z == x).
+
+        kernel = make_kernel(k_init)
+        f = GP(kernel)
+        fx = f(x, lik_noise)
+        fz = f(z)
+        q_ex = exact_variational_posterior(fz, fx, y)
+
+        gpr_post = posterior(fx, y) # Exact GP regression
+        vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior
+        svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior
+
+        @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10
+        @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10
+
+        @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10
+        @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10
+
+        @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y)
+    end
+
+    @testset "optimised posterior" begin
+        jitter = 1e-5
+
+        make_gp(kernel) = GP(kernel)
+
+        ## FIRST - define the models
+        # GPR - Exact GP regression
+        struct GPRModel
+            k # kernel parameters
+        end
+        Flux.@functor GPRModel
+
+        function (m::GPRModel)(x)
+            f = make_gp(make_kernel(m.k))
+            fx = f(x, lik_noise)
+            return fx
+        end
+
+        # SVGP - Sparse variational GP regression (Hensman 2014)
+        struct SVGPModel
+            k # kernel parameters
+            z # inducing points
+            m # variational mean
+            A # variational covariance sqrt (Σ = A'A)
+        end
+        Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+        function (m::SVGPModel)(x)
+            f = make_gp(make_kernel(m.k))
+            q = MvNormal(m.m, m.A'm.A)
+            fx = f(x, lik_noise)
+            fz = f(m.z, jitter)
+            return fx, fz, q
+        end
+
+        ## SECOND - create the models and associated training losses
+        gpr = GPRModel(copy(k_init))
+        function GPR_loss(x, y)
+            fx = gpr(x)
+            return -logpdf(fx, y)
+        end
+
+        m, A = zeros(N), Matrix{Float64}(I, N, N) # initialise the variational parameters
+        svgp = SVGPModel(copy(k_init), copy(z), m, A)
+        function SVGP_loss(x, y)
+            fx, fz, q = svgp(x)
+            return -elbo(fx, y, fz, q)
+        end
+
+        ## THIRD - train the models
+        data = [(x, y)]
+        opt = ADAM(0.001)
+
+        svgp_ps = Flux.params(svgp)
+        delete!(svgp_ps, svgp.k) # Don't train the kernel parameters
+
+        # Optimise q(u)
+        Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 20000), opt)
+
+        ## FOURTH - construct the posteriors
+        function posterior(m::GPRModel, x, y)
+            f = make_gp(make_kernel(m.k))
+            fx = f(x, lik_noise)
+            return AbstractGPs.posterior(fx, y)
+        end
+
+        function posterior(m::SVGPModel)
+            f = make_gp(make_kernel(m.k))
+            fz = f(m.z, jitter)
+            q = MvNormal(m.m, m.A'm.A)
+            return approx_posterior(SVGP(), fz, q)
+        end
+
+        gpr_post = posterior(gpr, x, y)
+        svgp_post = posterior(svgp)
+
+        ## FIFTH - test equivalences
+        @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4))
+        @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4))
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 00000000..3a60e76b
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,27 @@
+using Random
+using Test
+using SparseGPs
+using Flux
+using IterTools
+using AbstractGPs
+using Distributions
+using LinearAlgebra
+
+const GROUP = get(ENV, "GROUP", "All")
+const PKGDIR = dirname(dirname(pathof(SparseGPs)))
+
+include("test_utils.jl")
+
+@testset "SparseGPs" begin
+    include("svgp.jl")
+    println(" ")
+    @info "Ran svgp tests"
+
+    include("elbo.jl")
+    println(" ")
+    @info "Ran elbo tests"
+
+    include("equivalences.jl")
+    println(" ")
+    @info "Ran equivalences tests"
+end
diff --git a/test/svgp.jl b/test/svgp.jl
new file mode 100644
index 00000000..7dd90692
--- /dev/null
+++ b/test/svgp.jl
@@ -0,0 +1,20 @@
+@testset "svgp" begin
+    rng = MersenneTwister(123456)
+    N_cond = 5
+    N_a = 6
+    N_b = 7
+
+    # Specify prior.
+    f = GP(Matern32Kernel())
+    # Sample from prior.
+    x = collect(range(-1.0, 1.0; length=N_cond))
+    fx = f(x, 1e-15)
+    y = rand(rng, fx)
+
+    q = exact_variational_posterior(fx, fx, y)
+    f_approx_post = approx_posterior(SVGP(), fx, q)
+
+    a = collect(range(-1.0, 1.0; length=N_a))
+    b = randn(rng, N_b)
+    AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b)
+end
diff --git a/test/test_utils.jl b/test/test_utils.jl
new file mode 100644
index 00000000..805c799d
--- /dev/null
+++ b/test/test_utils.jl
@@ -0,0 +1,17 @@
+# Create a default kernel from two parameters k[1] and k[2]
+make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+
+# Computes the optimal closed form solution for the variational posterior
+# q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+# equations (11) & (12)). Assumes a ZeroMean function.
+function exact_variational_posterior(fu, fx, y)
+    fu.f.mean isa AbstractGPs.ZeroMean ||
+        error("The exact posterior requires a GP with ZeroMean.")
+    σ² = fx.Σy[1]
+    Kuf = cov(fu, fx)
+    Kuu = Symmetric(cov(fu))
+    Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf'))
+    m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y
+    S = Symmetric(Kuu * (Σ \ Kuu))
+    return MvNormal(m, S)
+end
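Putting the pieces of this PR together, a minimal end-to-end usage sketch might look as follows (illustrative only; the data, kernel, noise level and inducing-point choices here are arbitrary and not taken from the patch):

    using AbstractGPs, SparseGPs, Distributions, LinearAlgebra

    x = rand(200) * 6
    y = sin.(x) .+ 0.1 .* randn(200)
    z = x[1:10]                                          # inducing inputs

    f = GP(SqExponentialKernel())
    fx = f(x, 0.1)                                       # observations with Gaussian noise σ² = 0.1
    fz = f(z, 1e-6)                                      # FiniteGP at the inducing inputs (jittered)
    q = MvNormal(zeros(10), Matrix{Float64}(I, 10, 10))  # initial variational distribution q(u)

    elbo(fx, y, fz, q)                                   # objective to maximise w.r.t. q and the kernel
    post = approx_posterior(SVGP(), fz, q)               # approximate posterior GP
    mean(post, z), var(post, z)                          # posterior marginals at the inducing inputs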