diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
new file mode 100644
index 00000000..323237ba
--- /dev/null
+++ b/.JuliaFormatter.toml
@@ -0,0 +1 @@
+style = "blue"
diff --git a/Project.toml b/Project.toml
new file mode 100644
index 00000000..de63578f
--- /dev/null
+++ b/Project.toml
@@ -0,0 +1,17 @@
+name = "SparseGPs"
+uuid = "298c2ebc-0411-48ad-af38-99e88101b606"
+authors = ["Ross Viljoen "]
+version = "0.1.0"
+
+[deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40"
+KLDivergences = "3c9cd921-3d3f-41e2-830c-e020174918cc"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git a/examples/classification.jl b/examples/classification.jl
new file mode 100644
index 00000000..b1b7f6fc
--- /dev/null
+++ b/examples/classification.jl
@@ -0,0 +1,138 @@
+# Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html
+
+# %%
+using SparseGPs
+using AbstractGPs
+using GPLikelihoods
+using StatsFuns
+using FastGaussQuadrature
+using Distributions
+using LinearAlgebra
+using DelimitedFiles
+using IterTools
+
+using Plots
+default(; legend=:outertopright, size=(700, 400))
+
+using Random
+Random.seed!(1234)
+
+# %%
+# Read in the classification data
+data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv"
+x, y = eachcol(readdlm(data_file))
+scatter(x, y)
+
+# %%
+# First, create the GP kernel from given parameters k
+function make_kernel(k)
+    return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+end
+
+k = [10, 0.1]
+
+kernel = make_kernel(k)
+f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1)
+fx = f(x)
+
+# %%
+# Then, plot some samples from the prior underlying GP
+x_plot = 0:0.02:6
+prior_f_samples = rand(f.f(x_plot, 1e-6), 20)
+
+plt = plot(x_plot, prior_f_samples; seriescolor="red", linealpha=0.2, label="")
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+
+# %%
+# Plot the same samples, but pushed through a logistic sigmoid to constrain
+# them in (0, 1).
+prior_y_samples = mean.(f.lik.(prior_f_samples))
+
+plt = plot(x_plot, prior_y_samples; seriescolor="red", linealpha=0.2, label="")
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+
+# %%
+# A simple Flux model
+using Flux
+
+struct SVGPModel
+    k # kernel parameters
+    m # variational mean
+    A # variational covariance
+    z # inducing points
+end
+
+Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+lik = BernoulliLikelihood()
+jitter = 1e-4
+
+function (m::SVGPModel)(x)
+    kernel = make_kernel(m.k)
+    f = LatentGP(GP(kernel), lik, jitter)
+    q = MvNormal(m.m, m.A'm.A)
+    fx = f(x)
+    fu = f(m.z).fx
+    return fx, fu, q
+end
+
+function flux_loss(x, y; n_data=length(y))
+    fx, fu, q = model(x)
+    return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo())
+end
+
+# %%
+M = 15 # number of inducing points
+
+# Initialise the parameters
+k = [10, 0.1]
+m = zeros(M)
+A = Matrix{Float64}(I, M, M)
+z = x[1:M]
+
+model = SVGPModel(k, m, A, z)
+
+opt = ADAM(0.1)
+parameters = Flux.params(model)
+
+# %%
+# Negative ELBO before training
+println(flux_loss(x, y))
+
+# %%
+# Train the model
+Flux.train!(
+    (x, y) -> flux_loss(x, y),
+    parameters,
+    ncycle([(x, y)], 2000), # Train for 2000 epochs
+    opt,
+)
+
+# %%
+# Negative ELBO after training
+println(flux_loss(x, y))
+
+# %%
+# After optimisation, plot samples from the underlying posterior GP.
+fu = f(z).fx # want the underlying FiniteGP
+post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A))
+l_post = LatentGP(post, BernoulliLikelihood(), jitter)
+
+post_f_samples = rand(l_post.f(x_plot, 1e-6), 20)
+
+plt = plot(x_plot, post_f_samples; seriescolor="red", linealpha=0.2, legend=false)
+
+# %%
+# As above, push these samples through a logistic sigmoid to get posterior predictions.
+post_y_samples = mean.(l_post.lik.(post_f_samples))
+
+plt = plot(
+    x_plot,
+    post_y_samples;
+    seriescolor="red",
+    linealpha=0.2,
+    # legend=false,
+    label="",
+)
+scatter!(plt, x, y; seriescolor="blue", label="Data points")
+vline!(z; label="Pseudo-points")
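As a rough complement to the sample plots in the classification example (an illustrative sketch, not part of this patch; the variable names `f_samples`, `p_samples` and `p_mean` are arbitrary, and it reuses the `l_post`, `x_plot` and `plt` objects defined above), the mean posterior class probability p(y = 1 | x) could be estimated by averaging the Bernoulli means over many posterior function samples:

    f_samples = rand(l_post.f(x_plot, 1e-6), 1000)  # many posterior samples of the latent f
    p_samples = mean.(l_post.lik.(f_samples))       # logistic(f) for each sample
    p_mean = vec(mean(p_samples; dims=2))           # Monte Carlo estimate of p(y = 1 | x)
    plot!(plt, x_plot, p_mean; seriescolor="green", linewidth=2, label="Mean prediction")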
diff --git a/examples/data/classif_1D.csv b/examples/data/classif_1D.csv
new file mode 100644
index 00000000..70ddb862
--- /dev/null
+++ b/examples/data/classif_1D.csv
@@ -0,0 +1,50 @@
+5.668341708542713242e+00 0.000000000000000000e+00
+5.758793969849246075e+00 0.000000000000000000e+00
+5.517587939698492150e+00 0.000000000000000000e+00
+2.954773869346733584e+00 1.000000000000000000e+00
+3.648241206030150785e+00 1.000000000000000000e+00
+2.110552763819095290e+00 1.000000000000000000e+00
+4.613065326633165597e+00 0.000000000000000000e+00
+4.793969849246231263e+00 0.000000000000000000e+00
+4.703517587939698430e+00 0.000000000000000000e+00
+6.030150753768843686e-01 1.000000000000000000e+00
+3.015075376884421843e-01 0.000000000000000000e+00
+3.979899497487437099e+00 0.000000000000000000e+00
+3.226130653266331638e+00 1.000000000000000000e+00
+1.899497487437185939e+00 1.000000000000000000e+00
+1.145728643216080256e+00 1.000000000000000000e+00
+3.316582914572864249e-01 0.000000000000000000e+00
+6.030150753768843686e-01 1.000000000000000000e+00
+2.231155778894472252e+00 1.000000000000000000e+00
+3.256281407035175768e+00 1.000000000000000000e+00
+1.085427135678391997e+00 1.000000000000000000e+00
+1.809045226130653106e+00 1.000000000000000000e+00
+4.492462311557789079e+00 0.000000000000000000e+00
+1.959798994974874198e+00 1.000000000000000000e+00
+0.000000000000000000e+00 0.000000000000000000e+00
+3.346733668341708601e+00 1.000000000000000000e+00
+1.507537688442210921e-01 0.000000000000000000e+00
+1.809045226130653328e-01 1.000000000000000000e+00
+5.517587939698492150e+00 0.000000000000000000e+00
+2.201005025125628123e+00 1.000000000000000000e+00
+5.577889447236180409e+00 0.000000000000000000e+00
+1.809045226130653328e-01 0.000000000000000000e+00
+1.688442211055276365e+00 1.000000000000000000e+00
+4.160804020100502321e+00 0.000000000000000000e+00
+2.170854271356783993e+00 1.000000000000000000e+00
+4.311557788944723413e+00 0.000000000000000000e+00
+3.075376884422110546e+00 1.000000000000000000e+00
+5.125628140703517133e+00 0.000000000000000000e+00
+1.989949748743718549e+00 1.000000000000000000e+00
+5.366834170854271058e+00 0.000000000000000000e+00
+4.100502512562814061e+00 0.000000000000000000e+00
+7.236180904522613311e-01 1.000000000000000000e+00
+2.261306532663316382e+00 1.000000000000000000e+00
+3.467336683417085119e+00 1.000000000000000000e+00
+1.085427135678391997e+00 1.000000000000000000e+00
+5.095477386934673447e+00 0.000000000000000000e+00
+5.185929648241205392e+00 0.000000000000000000e+00
+2.743718592964823788e+00 1.000000000000000000e+00
+2.773869346733668362e+00 1.000000000000000000e+00
+1.417085427135678311e+00 1.000000000000000000e+00
+1.989949748743718549e+00 1.000000000000000000e+00
diff --git a/examples/regression.jl b/examples/regression.jl
new file mode 100644
index 00000000..518ee761
--- /dev/null
+++ b/examples/regression.jl
@@ -0,0 +1,174 @@
+# A recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html
+
+using AbstractGPs
+using SparseGPs
+using Distributions
+using LinearAlgebra
+using Optim
+using IterTools
+
+using Plots
+default(; legend=:outertopright, size=(700, 400))
+
+using Random
+Random.seed!(1234)
+
+# %%
+# The data generating function
+function g(x)
+    return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x)
+end
+
+N = 10000 # Number of training points
+x = rand(Uniform(-1, 1), N)
+y = g.(x) + 0.3 * randn(N)
+
+scatter(x, y; xlabel="x", ylabel="y", legend=false)
+
+# %%
+# A simple Flux model
+using Flux
+
+lik_noise = 0.3
+jitter = 1e-5
+
+struct SVGPModel
+    k # kernel parameters
+    m # variational mean
+    A # variational covariance
+    z # inducing points
+end
+
+Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+function make_kernel(k)
+    return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+end
+
+# Create the 'model' from the parameters - i.e. return the FiniteGP at inputs x,
+# the FiniteGP at inducing inputs z and the variational posterior over inducing
+# points - q(u).
+function (m::SVGPModel)(x)
+    kernel = make_kernel(m.k)
+    f = GP(kernel)
+    q = MvNormal(m.m, m.A'm.A)
+    fx = f(x, lik_noise)
+    fu = f(m.z, jitter)
+    return fx, fu, q
+end
+
+# Create the posterior GP from the model parameters.
+function posterior(m::SVGPModel)
+    kernel = make_kernel(m.k)
+    f = GP(kernel)
+    fu = f(m.z, jitter)
+    q = MvNormal(m.m, m.A'm.A)
+    return SparseGPs.approx_posterior(SVGP(), fu, q)
+end
+
+# Return the loss given data - in this case the negative ELBO.
+function flux_loss(x, y; n_data=length(y))
+    fx, fu, q = model(x)
+    return -SparseGPs.elbo(fx, y, fu, q; n_data)
+end
+
+# %%
+M = 50 # number of inducing points
+
+# Select the first M inputs as inducing inputs
+z = x[1:M]
+
+# Initialise the parameters
+k = [0.3, 10]
+m = zeros(M)
+A = Matrix{Float64}(I, M, M)
+
+model = SVGPModel(k, m, A, z)
+
+b = 100 # minibatch size
+opt = ADAM(0.001)
+parameters = Flux.params(model)
+data_loader = Flux.Data.DataLoader((x, y); batchsize=b)
+
+# %%
+# Negative ELBO before training
+println(flux_loss(x, y))
+
+# %%
+# Train the model
+Flux.train!(
+    (x, y) -> flux_loss(x, y; n_data=N),
+    parameters,
+    ncycle(data_loader, 300), # Train for 300 epochs
+    opt,
+)
+
+# %%
+# Negative ELBO after training
+println(flux_loss(x, y))
+
+# %%
+# Plot samples from the optimised approximate posterior.
+post = posterior(model)
+
+scatter(
+    x,
+    y;
+    markershape=:xcross,
+    markeralpha=0.1,
+    xlim=(-1, 1),
+    xlabel="x",
+    ylabel="y",
+    title="posterior (VI with sparse grid)",
+    label="Train Data",
+)
+plot!(-1:0.001:1, post; label="Posterior")
+vline!(z; label="Pseudo-points")
+
+# %% There is a closed form optimal solution for the variational posterior q(u)
+# (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+# equations (11) & (12)). The SVGP posterior with this optimal q(u) should
+# therefore be equivalent to the 'exact' sparse GP (Titsias) posterior.
+
+function exact_q(fu, fx, y)
+    σ² = fx.Σy[1]
+    Kuf = cov(fu, fx)
+    Kuu = Symmetric(cov(fu))
+    Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf'))
+    m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y
+    S = Symmetric(Kuu * (Σ \ Kuu))
+    return MvNormal(m, S)
+end
+
+kernel = make_kernel([0.3, 10])
+f = GP(kernel)
+fx = f(x, lik_noise)
+fu = f(z, jitter)
+q_ex = exact_q(fu, fx, y)
+
+scatter(x, y)
+scatter!(z, q_ex.μ)
+
+# These two should be the same - and they are, as the plot below shows
+ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior
+ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior
+
+# These are also approximately equal
+SparseGPs.elbo(fx, y, fu, q_ex)
+AbstractGPs.elbo(fx, y, fu)
+
+# %%
+scatter(
+    x,
+    y;
+    markershape=:xcross,
+    markeralpha=0.1,
+    xlim=(-1, 1),
+    xlabel="x",
+    ylabel="y",
+    title="posterior (VI with sparse grid)",
+    label="Train Data",
+)
+plot!(-1:0.001:1, ap_ex; label="SVGP posterior")
+plot!(-1:0.001:1, ap_tits; label="Titsias posterior")
+vline!(z; label="Pseudo-points")
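For reference, the closed-form optimum computed by `exact_q` above (equations (11) and (12) in the linked derivation) can be written, for Gaussian observation noise σ², as

```math
\Sigma = K_{uu} + \sigma^{-2} K_{uf} K_{uf}^\top, \quad
m^* = \sigma^{-2} K_{uu} \Sigma^{-1} K_{uf} y, \quad
S^* = K_{uu} \Sigma^{-1} K_{uu},
```

so that the optimal variational distribution is q(u) = N(m*, S*), which is exactly what the function returns.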
diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl
new file mode 100644
index 00000000..c0a165af
--- /dev/null
+++ b/src/SparseGPs.jl
@@ -0,0 +1,23 @@
+module SparseGPs
+
+using AbstractGPs
+using Distributions
+using LinearAlgebra
+using Statistics
+using StatsBase
+using FastGaussQuadrature
+using GPLikelihoods
+using SpecialFunctions
+using ChainRulesCore
+using FillArrays
+using KLDivergences
+
+using AbstractGPs:
+    AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A
+
+export SVGP, Default, Analytic, Quadrature, MonteCarlo
+
+include("elbo.jl")
+include("svgp.jl")
+
+end
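The next file, src/elbo.jl, implements the bound itself. For reference, the quantity assembled by the `_elbo` helper below is the uncollapsed ELBO of Hensman et al. (2015),

```math
\mathcal{L} = \frac{N}{B} \sum_{i=1}^{B} \mathbb{E}_{q(f_i)}\big[\log p(y_i \mid f_i)\big]
    - \mathrm{KL}\big(q(u) \,\|\, p(u)\big),
```

where B is the minibatch size, N is the `n_data` keyword argument, and the marginals q(fᵢ) are those returned by `marginals(post(fx.x))`.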
diff --git a/src/elbo.jl b/src/elbo.jl
new file mode 100644
index 00000000..38627ceb
--- /dev/null
+++ b/src/elbo.jl
@@ -0,0 +1,226 @@
+"Likelihoods which take a scalar as input and return a scalar."
+ScalarLikelihood = Union{
+    BernoulliLikelihood,
+    PoissonLikelihood,
+    GaussianLikelihood,
+    ExponentialLikelihood,
+    GammaLikelihood,
+}
+
+abstract type ExpectationMethod end
+struct Default <: ExpectationMethod end
+struct Analytic <: ExpectationMethod end
+
+struct Quadrature <: ExpectationMethod
+    n_points::Int
+end
+Quadrature() = Quadrature(20)
+
+struct MonteCarlo <: ExpectationMethod
+    n_samples::Int
+end
+MonteCarlo() = MonteCarlo(20)
+
+"""
+    elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default())
+
+Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are
+observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a
+variational distribution over inducing points `u = f(z)`.
+
+`method` selects which method is used to calculate the expected log likelihood
+in the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and
+`MonteCarlo()`. For likelihoods with an analytic solution, `Default()` uses this
+exact solution. If there is no such solution, `Default()` either uses
+`Quadrature()` or `MonteCarlo()`, depending on the likelihood.
+
+N.B. the likelihood is assumed to be Gaussian with observation noise `fx.Σy`.
+Further, `fx.Σy` must be homoscedastic and uncorrelated - i.e. `fx.Σy = α * I`.
+
+[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable
+variational Gaussian process classification." Artificial Intelligence and
+Statistics. PMLR, 2015.
+"""
+function AbstractGPs.elbo(
+    fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}},
+    y::AbstractVector{<:Real},
+    fz::FiniteGP,
+    q::AbstractMvNormal;
+    n_data=length(y),
+    method=Default(),
+)
+    return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data)
+end
+
+function AbstractGPs.elbo(
+    ::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...
+)
+    return error(
+        "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)",
+    )
+end
+
+"""
+    elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default())
+
+Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood.
+"""
+function AbstractGPs.elbo(
+    lfx::LatentFiniteGP,
+    y::AbstractVector,
+    fz::FiniteGP,
+    q::AbstractMvNormal;
+    n_data=length(y),
+    method=Default(),
+)
+    return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data)
+end
+
+# Compute the common elements of the ELBO
+function _elbo(
+    method::ExpectationMethod,
+    fx::FiniteGP,
+    y::AbstractVector,
+    fz::FiniteGP,
+    q::AbstractMvNormal,
+    lik::ScalarLikelihood,
+    n_data::Integer,
+)
+    post = approx_posterior(SVGP(), fz, q)
+    q_f = marginals(post(fx.x))
+    variational_exp = expected_loglik(method, y, q_f, lik)
+
+    kl_term = kldivergence(q, fz)
+
+    n_batch = length(y)
+    scale = n_data / n_batch
+    return sum(variational_exp) * scale - kl_term
+end
+
+"""
+    expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik)
+
+This function computes the expected log likelihood:
+
+```math
+    ∫ q(f) log p(y | f) df
+```
+where `p(y | f)` is the process likelihood.
+
+`q(f)` is an approximation to the latent function values `f` given by:
+```math
+    q(f) = ∫ p(f | u) q(u) du
+```
+where `q(u)` is the variational distribution over inducing points (see
+[`elbo`](@ref)). The marginal distributions of `q(f)` are given by `q_f`.
+
+`method` determines which method is used to calculate the expected log
+likelihood - see [`elbo`](@ref) for more details.
+
+# Extended help
+
+`q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to
+have independent marginals such that only the marginals of `q(f)` are required.
+"""
+expected_loglik(method, y, q_f, lik)
+
+"""
+    expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood)
+
+The expected log likelihood for a `ScalarLikelihood`, computed via `method`.
+Defaults to a closed form solution if it exists, otherwise defaults to
+Gauss-Hermite quadrature.
+"""
+function expected_loglik(
+    ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    method = _default_method(lik)
+    return expected_loglik(method, y, q_f, lik)
+end
+
+# The closed form solution for independent Gaussian noise
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    lik::GaussianLikelihood,
+)
+    return sum(
+        -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²)
+    )
+end
+
+# The closed form solution for a Poisson likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector,
+    q_f::AbstractVector{<:Normal},
+    ::PoissonLikelihood{ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1))
+end
+
+# The closed form solution for an Exponential likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    ::ExponentialLikelihood{ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ))
+end
+
+# The closed form solution for a Gamma likelihood with an exponential inverse link function
+function expected_loglik(
+    ::Analytic,
+    y::AbstractVector{<:Real},
+    q_f::AbstractVector{<:Normal},
+    lik::GammaLikelihood{<:Any,ExpLink},
+)
+    f_μ = mean.(q_f)
+    return sum(
+        (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .-
+        loggamma(lik.α),
+    )
+end
+
+function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik)
+    return error(
+        "No analytic solution exists for ",
+        typeof(lik),
+        ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.",
+    )
+end
+
+function expected_loglik(
+    mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    # take 'n_samples' reparameterised samples
+    f_μ = mean.(q_f)
+    fs = f_μ .+ std.(q_f) .* randn(eltype(f_μ), length(q_f), mc.n_samples)
+    lls = loglikelihood.(lik.(fs), y)
+    return sum(lls) / mc.n_samples
+end
+
+function expected_loglik(
+    gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood
+)
+    # Compute the expectation via Gauss-Hermite quadrature
+    # using a reparameterisation by change of variable
+    # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)
+    xs, ws = gausshermite(gh.n_points)
+    # size(fs): (length(y), n_points)
+    fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f)
+    lls = loglikelihood.(lik.(fs), y)
+    return sum((1 / √π) * lls * ws)
+end
+
+ChainRulesCore.@non_differentiable gausshermite(n)
+
+AnalyticLikelihood = Union{
+    PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood
+}
+_default_method(::AnalyticLikelihood) = Analytic()
+_default_method(_) = Quadrature()
diff --git a/src/svgp.jl b/src/svgp.jl
new file mode 100644
index 00000000..0e71a265
--- /dev/null
+++ b/src/svgp.jl
@@ -0,0 +1,71 @@
+struct SVGP end
+
+raw"""
+    approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal)
+
+Compute the approximate posterior [1] over the process `f = fz.f`, given inducing
+inputs `z = fz.x` and a variational distribution over inducing points `q(u)` where `u =
+f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)``
+is then given by:
+
+```math
+q(f^*) = \int p(f | u) q(u) du
+```
+which can be found in closed form.
+
+[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable
+variational Gaussian process classification." Artificial Intelligence and
+Statistics. PMLR, 2015.
+"""
+function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal)
+    m, A = mean(q), _chol_cov(q)
+    Kuu = _chol_cov(fz)
+    B = Kuu.L \ A.L
+    α = Kuu \ (m - mean(fz))
+    data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x)
+    return ApproxPosteriorGP(SVGP(), fz.f, data)
+end
+
+function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    return mean(f.prior, x) + cov(f.prior, x, f.data.u) * f.data.α
+end
+
+function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D)
+end
+
+function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D)
+end
+
+function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector)
+    B = f.data.B
+    Cxu = cov(f.prior, x, f.data.u)
+    Cuy = cov(f.prior, f.data.u, y)
+    D = f.data.Kuu.L \ Cuy
+    E = Cxu / f.data.Kuu.L'
+    return cov(f.prior, x, y) - (E * D) + (E * B * B' * D)
+end
+
+function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    μ = Cux' * f.data.α
+    Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D)
+    return μ, Σ
+end
+
+function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector)
+    Cux = cov(f.prior, f.data.u, x)
+    D = f.data.Kuu.L \ Cux
+    μ = Cux' * f.data.α
+    Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D)
+    return μ, Σ_diag
+end
+
+_chol_cov(q::AbstractMvNormal) = cholesky(Symmetric(cov(q)))
+_chol_cov(q::MvNormal) = cholesky(q.Σ)
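For reference, the closed form evaluated by the `mean`, `cov` and `var` methods above is, for q(u) = N(m, S) and test inputs x*,

```math
q(f^*) = \mathcal{N}\Big(\mu_* + K_{*u} K_{uu}^{-1} (m - \mu_u),\;
    K_{**} - K_{*u} K_{uu}^{-1} K_{u*} + K_{*u} K_{uu}^{-1} S K_{uu}^{-1} K_{u*}\Big),
```

with the Cholesky factor of `Kuu`, `α = Kuu \ (m - mean(fz))` and `B = Kuu.L \ A.L` cached in `data`, so that the quadratic forms reduce to the `At_A`/`diag_At_A` calls in the code.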
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 00000000..47a7de77
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,8 @@
+[deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/elbo.jl b/test/elbo.jl
new file mode 100644
index 00000000..0f20367c
--- /dev/null
+++ b/test/elbo.jl
@@ -0,0 +1,36 @@
+@testset "elbo" begin
+    rng, N = MersenneTwister(654321), 20
+    x = rand(rng, N) * 10
+    y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N)
+    z = x[begin:5]
+
+    kernel = make_kernel([0.2, 0.6])
+    f = GP(kernel)
+    fx = f(x, 0.1)
+    fz = f(z)
+    q_ex = exact_variational_posterior(fz, fx, y)
+
+    @test elbo(fx, y, fz, q_ex) isa Real
+    @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y)
+
+    fx_bad = f(x, fill(0.1, N))
+    @test_throws ErrorException elbo(fx_bad, y, fz, q_ex)
+
+    # Test that the various methods of computing expectations return the same
+    # result.
+    rng = MersenneTwister(123456)
+    q_f = Normal.(zeros(10), ones(10))
+
+    @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood)
+        l = lik()
+        methods = [Quadrature(100), MonteCarlo(1000000)]
+        def = SparseGPs._default_method(l)
+        if def isa Analytic
+            push!(methods, def)
+        end
+        y = rand.(rng, l.(zeros(10)))
+
+        results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods)
+        @test all(x -> isapprox(x, results[end]; rtol=1e-3), results)
+    end
+end
diff --git a/test/equivalences.jl b/test/equivalences.jl
new file mode 100644
index 00000000..3bb4c7bc
--- /dev/null
+++ b/test/equivalences.jl
@@ -0,0 +1,117 @@
+@testset "equivalences" begin
+    rng, N = MersenneTwister(654321), 20
+    x = rand(rng, N) * 10
+    y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N)
+
+    z = copy(x) # Set inducing inputs == training inputs
+
+    k_init = [0.2, 0.6] # initial kernel parameters
+    lik_noise = 0.1 # The (fixed) Gaussian likelihood noise
+
+    @testset "exact posterior" begin
+        # There is a closed form optimal solution for the variational posterior
+        # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+        # equations (11) & (12)). The SVGP posterior with this optimal q(u)
+        # should therefore be equivalent to the sparse GP (Titsias) posterior
+        # and exact GP regression (when z == x).
+
+        kernel = make_kernel(k_init)
+        f = GP(kernel)
+        fx = f(x, lik_noise)
+        fz = f(z)
+        q_ex = exact_variational_posterior(fz, fx, y)
+
+        gpr_post = posterior(fx, y) # Exact GP regression
+        vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior
+        svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior
+
+        @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10
+        @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10
+
+        @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10
+        @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10
+
+        @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y)
+    end
+
+    @testset "optimised posterior" begin
+        jitter = 1e-5
+
+        make_gp(kernel) = GP(kernel)
+
+        ## FIRST - define the models
+        # GPR - Exact GP regression
+        struct GPRModel
+            k # kernel parameters
+        end
+        Flux.@functor GPRModel
+
+        function (m::GPRModel)(x)
+            f = make_gp(make_kernel(m.k))
+            fx = f(x, lik_noise)
+            return fx
+        end
+
+        # SVGP - Sparse variational GP regression (Hensman 2014)
+        struct SVGPModel
+            k # kernel parameters
+            z # inducing points
+            m # variational mean
+            A # variational covariance sqrt (Σ = A'A)
+        end
+        Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs
+
+        function (m::SVGPModel)(x)
+            f = make_gp(make_kernel(m.k))
+            q = MvNormal(m.m, m.A'm.A)
+            fx = f(x, lik_noise)
+            fz = f(m.z, jitter)
+            return fx, fz, q
+        end
+
+        ## SECOND - create the models and associated training losses
+        gpr = GPRModel(copy(k_init))
+        function GPR_loss(x, y)
+            fx = gpr(x)
+            return -logpdf(fx, y)
+        end
+
+        m, A = zeros(N), Matrix{Float64}(I, N, N) # initialise the variational parameters
+        svgp = SVGPModel(copy(k_init), copy(z), m, A)
+        function SVGP_loss(x, y)
+            fx, fz, q = svgp(x)
+            return -elbo(fx, y, fz, q)
+        end
+
+        ## THIRD - train the models
+        data = [(x, y)]
+        opt = ADAM(0.001)
+
+        svgp_ps = Flux.params(svgp)
+        delete!(svgp_ps, svgp.k) # Don't train the kernel parameters
+
+        # Optimise q(u)
+        Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 20000), opt)
+
+        ## FOURTH - construct the posteriors
+        function posterior(m::GPRModel, x, y)
+            f = make_gp(make_kernel(m.k))
+            fx = f(x, lik_noise)
+            return AbstractGPs.posterior(fx, y)
+        end
+
+        function posterior(m::SVGPModel)
+            f = make_gp(make_kernel(m.k))
+            fz = f(m.z, jitter)
+            q = MvNormal(m.m, m.A'm.A)
+            return approx_posterior(SVGP(), fz, q)
+        end
+
+        gpr_post = posterior(gpr, x, y)
+        svgp_post = posterior(svgp)
+
+        ## FIFTH - test equivalences
+        @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4))
+        @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4))
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 00000000..3a60e76b
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,27 @@
+using Random
+using Test
+using SparseGPs
+using Flux
+using IterTools
+using AbstractGPs
+using Distributions
+using LinearAlgebra
+
+const GROUP = get(ENV, "GROUP", "All")
+const PKGDIR = dirname(dirname(pathof(SparseGPs)))
+
+include("test_utils.jl")
+
+@testset "SparseGPs" begin
+    include("svgp.jl")
+    println(" ")
+    @info "Ran svgp tests"
+
+    include("elbo.jl")
+    println(" ")
+    @info "Ran elbo tests"
+
+    include("equivalences.jl")
+    println(" ")
+    @info "Ran equivalences tests"
+end
diff --git a/test/svgp.jl b/test/svgp.jl
new file mode 100644
index 00000000..7dd90692
--- /dev/null
+++ b/test/svgp.jl
@@ -0,0 +1,20 @@
+@testset "svgp" begin
+    rng = MersenneTwister(123456)
+    N_cond = 5
+    N_a = 6
+    N_b = 7
+
+    # Specify prior.
+    f = GP(Matern32Kernel())
+    # Sample from prior.
+    x = collect(range(-1.0, 1.0; length=N_cond))
+    fx = f(x, 1e-15)
+    y = rand(rng, fx)
+
+    q = exact_variational_posterior(fx, fx, y)
+    f_approx_post = approx_posterior(SVGP(), fx, q)
+
+    a = collect(range(-1.0, 1.0; length=N_a))
+    b = randn(rng, N_b)
+    AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b)
+end
diff --git a/test/test_utils.jl b/test/test_utils.jl
new file mode 100644
index 00000000..805c799d
--- /dev/null
+++ b/test/test_utils.jl
@@ -0,0 +1,17 @@
+# Create a default kernel from two parameters k[1] and k[2]
+make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2])))
+
+# Computes the optimal closed form solution for the variational posterior
+# q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/
+# equations (11) & (12)). Assumes a ZeroMean function.
+function exact_variational_posterior(fu, fx, y)
+    fu.f.mean isa AbstractGPs.ZeroMean ||
+        error("The exact posterior requires a GP with ZeroMean.")
+    σ² = fx.Σy[1]
+    Kuf = cov(fu, fx)
+    Kuu = Symmetric(cov(fu))
+    Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf'))
+    m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y
+    S = Symmetric(Kuu * (Σ \ Kuu))
+    return MvNormal(m, S)
+end
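Putting the pieces of this PR together, a minimal end-to-end usage sketch might look as follows (illustrative only; the data, kernel, noise level and inducing-point choices here are arbitrary and not taken from the patch):

    using AbstractGPs, SparseGPs, Distributions, LinearAlgebra

    x = rand(200) * 6
    y = sin.(x) .+ 0.1 .* randn(200)
    z = x[1:10]                                          # inducing inputs

    f = GP(SqExponentialKernel())
    fx = f(x, 0.1)                                       # observations with Gaussian noise σ² = 0.1
    fz = f(z, 1e-6)                                      # FiniteGP at the inducing inputs (jittered)
    q = MvNormal(zeros(10), Matrix{Float64}(I, 10, 10))  # initial variational distribution q(u)

    elbo(fx, y, fz, q)                                   # objective to maximise w.r.t. q and the kernel
    post = approx_posterior(SVGP(), fz, q)               # approximate posterior GP
    mean(post, z), var(post, z)                          # posterior marginals at the inducing inputs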