From 3cec6a0935a483ebce5e5fea2b6e4e4729acf60a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 19 Jun 2021 17:32:22 +0100 Subject: [PATCH 01/66] Add files --- Manifest.toml | 434 ++++++++++++++++++++++++++++++++++++++++ Project.toml | 13 ++ examples/gpflow_svgp.jl | 132 ++++++++++++ src/SparseGPs.jl | 19 ++ src/svgp.jl | 53 +++++ 5 files changed, 651 insertions(+) create mode 100644 Manifest.toml create mode 100644 Project.toml create mode 100644 examples/gpflow_svgp.jl create mode 100644 src/SparseGPs.jl create mode 100644 src/svgp.jl diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..6953aae4 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,434 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractGPs]] +deps = ["ChainRulesCore", "Distributions", "FillArrays", "KernelFunctions", "LinearAlgebra", "Random", "RecipesBase", "Reexport", "Statistics", "StatsBase"] +git-tree-sha1 = "d3700bd0201d2ec29c0b18d6f3f971f7072fe491" +uuid = "99985d1d-32ba-4be9-9821-2ec096f28918" +version = "0.3.5" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[ArrayInterface]] +deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] +git-tree-sha1 = "045ff5e1bc8c6fb1ecb28694abba0a0d55b5f4f5" +uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "3.1.17" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "4289a76df5a8568cca9970e54dd585c6c395c496" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "0.10.7" + +[[CommonSubexpressions]] +deps = ["MacroTools", "Test"] +git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.30.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[CompositionsBase]] +git-tree-sha1 = "f3955eb38944e5dd0fabf8ca1e267d94941d34a5" +uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" +version = "0.1.0" + +[[DataAPI]] +git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.6.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.9" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["StaticArrays"] +git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "1.0.3" + +[[DiffRules]] +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "1.0.2" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics", "StatsAPI"] +git-tree-sha1 = "abe4ad222b26af3337262b8afb28fab8d215e9f8" 
+uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.10.3" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] +git-tree-sha1 = "62e1ac52e9adf4234285cd88c94954924aa3f9ef" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.25.5" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.11.7" + +[[FiniteDiff]] +deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] +git-tree-sha1 = "f6f80c8f934efd49a286bb5315360be66956dfc4" +uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" +version = "2.8.0" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.18" + +[[Functors]] +deps = ["MacroTools"] +git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" +uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +version = "0.2.1" + +[[IfElse]] +git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" +uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" +version = "0.1.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelFunctions]] +deps = ["ChainRulesCore", "Compat", "CompositionsBase", "Distances", "FillArrays", "Functors", "LinearAlgebra", "Random", "Requires", "SpecialFunctions", "StatsBase", "StatsFuns", "TensorCore", "Test", "ZygoteRules"] +git-tree-sha1 = "c7b25bc625ca2ee217021d29e3ddf031967bf0ff" +uuid = "ec8451be-7e33-11e9-00cf-bbf324bd1392" +version = "0.10.5" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LineSearches]] +deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf"] +git-tree-sha1 = "f27132e551e959b3667d8c93eae90973225032dd" +uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" +version = "7.1.1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "LinearAlgebra"] +git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.2.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + 
+[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.0.0" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NLSolversBase]] +deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] +git-tree-sha1 = "50608f411a1e178e0129eab4110bd56efd08816f" +uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" +version = "7.8.0" + +[[NaNMath]] +git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.5" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[Optim]] +deps = ["Compat", "FillArrays", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "SparseArrays", "StatsBase"] +git-tree-sha1 = "d34366a3abc25c41f88820762ef7dfdfe9306711" +uuid = "429524aa-4258-5aef-a3af-852621145aeb" +version = "1.3.0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[PDMats]] +deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] +git-tree-sha1 = "4dd403333bcf0909341cfe57ec115152f937d7d8" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.11.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PositiveFactorizations]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" +uuid = "85a6dd25-e78a-55b7-8502-1745935b8125" +version = "0.2.4" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "12fbe86da16df6679be7521dfb39fbc861e1dc7b" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.4.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RecipesBase]] +git-tree-sha1 = "b3fb709f3c97bfc6e948be68beeecb55a0b340ae" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "1.1.1" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = 
"4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[Rmath]] +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "bf3188feca147ce108c76ad82c2792c57abe7b1f" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.7.0" + +[[Rmath_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "68db32dff12bb6127bac73c209881191bf0efbb7" +uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.3.0+0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "2ec1962eba973f383239da22e75218565c390a96" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.0.0" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.5.1" + +[[Static]] +deps = ["IfElse"] +git-tree-sha1 = "2740ea27b66a41f9d213561a04573da5d3823d4b" +uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" +version = "0.2.5" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "57a9b3c69933e15e5b7041b6a57d1533ef1a9882" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.3" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsAPI]] +git-tree-sha1 = "1958272568dc176a1d881acb797beb909c785510" +uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" +version = "1.0.0" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] +git-tree-sha1 = "2f6792d523d7448bbe2fec99eca9218f06cc746d" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.33.8" + +[[StatsFuns]] +deps = ["LogExpFunctions", "Rmath", "SpecialFunctions"] +git-tree-sha1 = "30cd8c360c54081f806b1ee14d2eecbef3c04c49" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.8" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[TensorCore]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" +uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50" +version = "0.1.1" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[ZygoteRules]] +deps = ["MacroTools"] +git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" +uuid = "700de1a5-db45-46bc-99cf-38207098b444" +version = "0.2.1" + +[[nghttp2_jll]] +deps = 
["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..e97fe5a2 --- /dev/null +++ b/Project.toml @@ -0,0 +1,13 @@ +name = "SparseGPs" +uuid = "298c2ebc-0411-48ad-af38-99e88101b606" +authors = ["Ross Viljoen "] +version = "0.1.0" + +[deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Optim = "429524aa-4258-5aef-a3af-852621145aeb" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl new file mode 100644 index 00000000..935f5330 --- /dev/null +++ b/examples/gpflow_svgp.jl @@ -0,0 +1,132 @@ +# An attempted recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html + +using AbstractGPs +using SparseGPs +using Distributions +using LinearAlgebra +using StatsFuns +using Optim + +using Plots +default(; legend=:outertopright, size=(700, 400)) + +using Random +Random.seed!(1234) + +# %% +function g(x) + return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) +end + +N = 1000 # Number of training points +x = rand(Uniform(-1, 1), N) +y = g.(x) + 0.3 * randn(N) + +scatter(x, y; xlabel="x", ylabel="y", legend=false) + +# %% +M = 30 # number of inducing points + +function pack_params(θ, m, A) + return vcat(θ, m, vec(A)) +end + +function unpack_params(params, m; include_z=false) + if include_z + k = params[1:2] + z = params[3:m+2] + μ = params[m+3:2m+2] + s = params[2m+3:end] + Σ = reshape(s, (M, M)) + return k, z, μ, Σ + else + k = params[1:2] + μ = params[3:m+2] + s = params[m+3:end] + Σ = reshape(s, (M, M)) + return k, μ, Σ + end +end + +x0 = pack_params(rand(2), zeros(M), vec(Matrix{Float64}(I, M, M))) +z = x[1:M] + +# %% +function objective_function(x, y) + function neg_elbo(params) + # k, z, qμ, qΣ_L = split_params(params, M) + k, m, A = unpack_params(params, M) + kernel = + (softplus(k[1])) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + f = GP(kernel) + fx = f(x, 0.1) + q = MvNormal(m, A'A) + return -SparseGPs.elbo(fx, y, f(z), q) + end + return neg_elbo +end + +# Currently fails at the cholesky factorisation of cov(f(z)) +opt = optimize(objective_function(x, y), x0, LBFGS()) + +# %% +opt_k, opt_μ, opt_Σ_L = unpack_params(opt.minimizer, M) +opt_kernel = + softplus(opt_k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(opt_k[2]) + 0.01)) +opt_f = GP(opt_kernel) +opt_q = MvNormal(opt_μ, opt_Σ_L * opt_Σ_L') +ap = SparseGPs.approx_posterior(SVGP(), opt_f(z), opt_q) +logpdf(ap(x), y) + +# %% +scatter( + x, + y; + xlim=(0, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", +) +# scatter!(x, y; label="Test Data") +plot!(-1:0.001:1, ap; label=false) +vline!(z; label="Pseudo-points") + + +# %% Find the exact posterior over u (e.g. +# https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations +# (11) & (12)) As a sanity check -- this seems to work. 
+ +function exact_q(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + A = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, A) +end + +kernel = 0.3 * (SqExponentialKernel() ∘ ScaleTransform(10)) +f = GP(kernel) +fx = f(x) +fu = f(z) +q_ex = exact_q(fu, fx, y) + +scatter(x, y) +scatter!(z, q_ex.μ) + +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) + +# %% +scatter( + x, + y; + xlim=(0, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", +) +plot!(-1:0.001:1, ap_ex; label=false) +vline!(z; label="Pseudo-points") diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl new file mode 100644 index 00000000..24f7faa3 --- /dev/null +++ b/src/SparseGPs.jl @@ -0,0 +1,19 @@ +module SparseGPs + +using AbstractGPs +using Distributions +using Optim +using StatsFuns +using LinearAlgebra +using Statistics +using StatsBase + +using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, Xt_invA_X, diag_At_A + +export elbo, + approx_posterior, + SVGP + +include("svgp.jl") + +end diff --git a/src/svgp.jl b/src/svgp.jl new file mode 100644 index 00000000..64a19d63 --- /dev/null +++ b/src/svgp.jl @@ -0,0 +1,53 @@ +struct SVGP end # TODO: should probably just be VFE? + +function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) + m, A = q.μ, q.Σ.chol + Kuu = cholesky(Symmetric(cov(fu))) + B = Kuu.L \ A.L + data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) + return ApproxPosteriorGP(SVGP(), fu.f, data) +end + +function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + # TODO: Don't compute the full covar + return diag(cov(f, x)) +end + +function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + return cov(f.prior, x, f.data.u) * f.data.α +end + +function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + Kuu = f.data.Kuu + B = f.data.B + D = f.data.Kuu.L \ Cux + return cov(f.prior, x) - D' * B * B' * D +end + +function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + # TODO: implement properly + return mean(f, x), cov(f, x) +end + +function kl_divergence(q::MvNormal, p::AbstractMvNormal) + (1/2) * (logdet(q.Σ.chol) + - logdet(cov(p)) - length(mean(p)) + + tr(inv(q.Σ.chol) * cov(p)) + Xt_invA_X(q.Σ.chol, (mean(q)-mean(p)))) +end + +function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) + kl_term = kl_divergence(q, fu) + post = approx_posterior(SVGP(), fu, q) + f_mean = mean(post, fx.x) + f_var = var(post, fx.x) + + Σy = diag(fx.Σy) + + # TODO: general method for likelihoods - quadrature like GPFlow? + variational_exp = -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) + + # TODO: rescale for minibatches + return sum(variational_exp) - kl_term +end + From 6c814a6c23e35e833b879b28f7712b2ea7585265 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 20 Jun 2021 23:54:06 +0100 Subject: [PATCH 02/66] Fixed KL and posterior covariance. 
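The corrected covariance is intended to match the standard SVGP predictive form
(cf. Hensman et al. 2013), written informally with S = cov(q(u)):

    cov(f*) = Kxx - Kxu Kuu^-1 Kux + Kxu Kuu^-1 S Kuu^-1 Kux

With D = Luu \ Kux and B = Luu \ Ls (Luu and Ls the Cholesky factors of Kuu and
S), this reduces to Kxx - D'D + D'*B*B'*D, which is the expression cov() now uses.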
--- .gitignore | 11 ++ Manifest.toml | 434 -------------------------------------------------- src/svgp.jl | 14 +- 3 files changed, 18 insertions(+), 441 deletions(-) create mode 100644 .gitignore delete mode 100644 Manifest.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..47e36092 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +.DS_Store +.idea +*.log +tmp/ + +*.result +*.json +*.jld2 +*.cov +*.info +Manifest.toml diff --git a/Manifest.toml b/Manifest.toml deleted file mode 100644 index 6953aae4..00000000 --- a/Manifest.toml +++ /dev/null @@ -1,434 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[AbstractGPs]] -deps = ["ChainRulesCore", "Distributions", "FillArrays", "KernelFunctions", "LinearAlgebra", "Random", "RecipesBase", "Reexport", "Statistics", "StatsBase"] -git-tree-sha1 = "d3700bd0201d2ec29c0b18d6f3f971f7072fe491" -uuid = "99985d1d-32ba-4be9-9821-2ec096f28918" -version = "0.3.5" - -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[ArrayInterface]] -deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] -git-tree-sha1 = "045ff5e1bc8c6fb1ecb28694abba0a0d55b5f4f5" -uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "3.1.17" - -[[Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4289a76df5a8568cca9970e54dd585c6c395c496" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.10.7" - -[[CommonSubexpressions]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" -uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" -version = "0.3.0" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.30.0" - -[[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[CompositionsBase]] -git-tree-sha1 = "f3955eb38944e5dd0fabf8ca1e267d94941d34a5" -uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" -version = "0.1.0" - -[[DataAPI]] -git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.6.0" - -[[DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.9" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[DiffResults]] -deps = ["StaticArrays"] -git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" -uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "1.0.3" - -[[DiffRules]] -deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" -uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.2" - -[[Distances]] -deps = ["LinearAlgebra", "Statistics", "StatsAPI"] -git-tree-sha1 = "abe4ad222b26af3337262b8afb28fab8d215e9f8" -uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -version = "0.10.3" - -[[Distributed]] -deps = 
["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "62e1ac52e9adf4234285cd88c94954924aa3f9ef" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.5" - -[[DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.5" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.11.7" - -[[FiniteDiff]] -deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] -git-tree-sha1 = "f6f80c8f934efd49a286bb5315360be66956dfc4" -uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" -version = "2.8.0" - -[[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" -uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.18" - -[[Functors]] -deps = ["MacroTools"] -git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" -uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -version = "0.2.1" - -[[IfElse]] -git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" -uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" -version = "0.1.0" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" - -[[KernelFunctions]] -deps = ["ChainRulesCore", "Compat", "CompositionsBase", "Distances", "FillArrays", "Functors", "LinearAlgebra", "Random", "Requires", "SpecialFunctions", "StatsBase", "StatsFuns", "TensorCore", "Test", "ZygoteRules"] -git-tree-sha1 = "c7b25bc625ca2ee217021d29e3ddf031967bf0ff" -uuid = "ec8451be-7e33-11e9-00cf-bbf324bd1392" -version = "0.10.5" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LineSearches]] -deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf"] -git-tree-sha1 = "f27132e551e959b3667d8c93eae90973225032dd" -uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" -version = "7.1.1" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[LogExpFunctions]] -deps = ["DocStringExtensions", "LinearAlgebra"] -git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.2.4" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = 
"6a8a2a625ab0dea913aba95c11370589e0239ff0" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.6" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.0" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[NLSolversBase]] -deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] -git-tree-sha1 = "50608f411a1e178e0129eab4110bd56efd08816f" -uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" -version = "7.8.0" - -[[NaNMath]] -git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.5" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" - -[[Optim]] -deps = ["Compat", "FillArrays", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "SparseArrays", "StatsBase"] -git-tree-sha1 = "d34366a3abc25c41f88820762ef7dfdfe9306711" -uuid = "429524aa-4258-5aef-a3af-852621145aeb" -version = "1.3.0" - -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" - -[[PDMats]] -deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "4dd403333bcf0909341cfe57ec115152f937d7d8" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.11.1" - -[[Parameters]] -deps = ["OrderedCollections", "UnPack"] -git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" -uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" -version = "0.12.2" - -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[PositiveFactorizations]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" -uuid = "85a6dd25-e78a-55b7-8502-1745935b8125" -version = "0.2.4" - -[[Preferences]] -deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "12fbe86da16df6679be7521dfb39fbc861e1dc7b" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.4.1" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[RecipesBase]] -git-tree-sha1 = "b3fb709f3c97bfc6e948be68beeecb55a0b340ae" -uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -version = "1.1.1" - -[[Reexport]] -git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.1.0" - -[[Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" -uuid = 
"ae029012-a4dd-5104-9daa-d747884805df" -version = "1.1.3" - -[[Rmath]] -deps = ["Random", "Rmath_jll"] -git-tree-sha1 = "bf3188feca147ce108c76ad82c2792c57abe7b1f" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.7.0" - -[[Rmath_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "68db32dff12bb6127bac73c209881191bf0efbb7" -uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" -version = "0.3.0+0" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[SortingAlgorithms]] -deps = ["DataStructures"] -git-tree-sha1 = "2ec1962eba973f383239da22e75218565c390a96" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.0.0" - -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[SpecialFunctions]] -deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] -git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.5.1" - -[[Static]] -deps = ["IfElse"] -git-tree-sha1 = "2740ea27b66a41f9d213561a04573da5d3823d4b" -uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -version = "0.2.5" - -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "57a9b3c69933e15e5b7041b6a57d1533ef1a9882" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.3" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[StatsAPI]] -git-tree-sha1 = "1958272568dc176a1d881acb797beb909c785510" -uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -version = "1.0.0" - -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "2f6792d523d7448bbe2fec99eca9218f06cc746d" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.8" - -[[StatsFuns]] -deps = ["LogExpFunctions", "Rmath", "SpecialFunctions"] -git-tree-sha1 = "30cd8c360c54081f806b1ee14d2eecbef3c04c49" -uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.8" - -[[SuiteSparse]] -deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] -uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" - -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[TensorCore]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" -uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50" -version = "0.1.1" - -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[UnPack]] -git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" -uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" -version = "1.0.2" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[ZygoteRules]] -deps = ["MacroTools"] -git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" -uuid = "700de1a5-db45-46bc-99cf-38207098b444" -version = "0.2.1" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = 
"8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/svgp.jl b/src/svgp.jl index 64a19d63..47fa7b57 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,7 +1,7 @@ struct SVGP end # TODO: should probably just be VFE? function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) - m, A = q.μ, q.Σ.chol + m, A = q.μ, cholesky(q.Σ) Kuu = cholesky(Symmetric(cov(fu))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) @@ -22,7 +22,7 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Kuu = f.data.Kuu B = f.data.B D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - D' * B * B' * D + return cov(f.prior, x) - D'D + D' * B * B' * D end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -31,9 +31,8 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function kl_divergence(q::MvNormal, p::AbstractMvNormal) - (1/2) * (logdet(q.Σ.chol) - - logdet(cov(p)) - length(mean(p)) - + tr(inv(q.Σ.chol) * cov(p)) + Xt_invA_X(q.Σ.chol, (mean(q)-mean(p)))) + (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + + AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) @@ -45,8 +44,9 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal Σy = diag(fx.Σy) # TODO: general method for likelihoods - quadrature like GPFlow? - variational_exp = -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) - + variational_exp = -0.5 * ( + log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy + ) # TODO: rescale for minibatches return sum(variational_exp) - kl_term end From 798f77a6f742e69a286507e807298aac3a98c6b4 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 00:53:48 +0100 Subject: [PATCH 03/66] Update example to use Flux --- Project.toml | 1 + examples/gpflow_svgp.jl | 118 ++++++++++++++++++++++------------------ 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/Project.toml b/Project.toml index e97fe5a2..92dd78b6 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 935f5330..158001e9 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -4,7 +4,6 @@ using AbstractGPs using SparseGPs using Distributions using LinearAlgebra -using StatsFuns using Optim using Plots @@ -25,59 +24,67 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) # %% -M = 30 # number of inducing points +M = 50 # number of inducing points -function pack_params(θ, m, A) - return vcat(θ, m, vec(A)) +# TODO: incorporate better inducing point selection from +# https://github.com/JuliaGaussianProcesses/InducingPoints.jl? 
+z = x[1:M] + +# %% +# A simple Flux model +using Flux + +struct SVGPModel + k + m + A + z end -function unpack_params(params, m; include_z=false) - if include_z - k = params[1:2] - z = params[3:m+2] - μ = params[m+3:2m+2] - s = params[2m+3:end] - Σ = reshape(s, (M, M)) - return k, z, μ, Σ - else - k = params[1:2] - μ = params[3:m+2] - s = params[m+3:end] - Σ = reshape(s, (M, M)) - return k, μ, Σ - end +function make_kernel(k) + return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) end -x0 = pack_params(rand(2), zeros(M), vec(Matrix{Float64}(I, M, M))) -z = x[1:M] +function (m::SVGPModel)(x, y) + kernel = make_kernel(m.k) + f = GP(kernel) + q = MvNormal(m.m, m.A'm.A + 0.001I) + fx = f(x, 0.1) + fu = f(m.z, 0.1) + return -SparseGPs.elbo(fx, y, fu, q) +end -# %% -function objective_function(x, y) - function neg_elbo(params) - # k, z, qμ, qΣ_L = split_params(params, M) - k, m, A = unpack_params(params, M) - kernel = - (softplus(k[1])) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - f = GP(kernel) - fx = f(x, 0.1) - q = MvNormal(m, A'A) - return -SparseGPs.elbo(fx, y, f(z), q) - end - return neg_elbo +function posterior(m::SVGPModel) + kernel = make_kernel(m.k) + f = GP(kernel) + fu = f(m.z, 0.1) + q = MvNormal(m.m, m.A'm.A + 0.0001I) + return SparseGPs.approx_posterior(SVGP(), fu, q) end -# Currently fails at the cholesky factorisation of cov(f(z)) -opt = optimize(objective_function(x, y), x0, LBFGS()) +k = [0.3, 10] +m = zeros(M) +A = Matrix{Float64}(I, M, M) -# %% -opt_k, opt_μ, opt_Σ_L = unpack_params(opt.minimizer, M) -opt_kernel = - softplus(opt_k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(opt_k[2]) + 0.01)) -opt_f = GP(opt_kernel) -opt_q = MvNormal(opt_μ, opt_Σ_L * opt_Σ_L') -ap = SparseGPs.approx_posterior(SVGP(), opt_f(z), opt_q) -logpdf(ap(x), y) +model = SVGPModel(k, m, A, z) + +function flux_loss(x, y) + return model(x, y) +end + +data = [(x, y)] +opt = ADAM(0.01) +parameters = Flux.params(k, m, A) + +println(flux_loss(x, y)) + +for epoch in 1:300 + Flux.train!(flux_loss, parameters, data, opt) +end + +println(flux_loss(x, y)) +post = posterior(model) # %% scatter( x, @@ -88,14 +95,13 @@ scatter( title="posterior (VI with sparse grid)", label="Train Data", ) -# scatter!(x, y; label="Test Data") -plot!(-1:0.001:1, ap; label=false) +plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") # %% Find the exact posterior over u (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations -# (11) & (12)) As a sanity check -- this seems to work. +# (11) & (12)) As a sanity check. function exact_q(fu, fx, y) σ² = fx.Σy[1] @@ -107,16 +113,22 @@ function exact_q(fu, fx, y) return MvNormal(m, A) end -kernel = 0.3 * (SqExponentialKernel() ∘ ScaleTransform(10)) +kernel = make_kernel([0.2, 11]) f = GP(kernel) -fx = f(x) -fu = f(z) +fx = f(x, 0.1) +fu = f(z, 0.1) q_ex = exact_q(fu, fx, y) scatter(x, y) scatter!(z, q_ex.μ) -ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) +# These two should be the same - and they are, the plot below shows almost identical predictions +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman 2013 (exact) posterior +ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior + +# Should these be the same? 
(they currently aren't) +SparseGPs.elbo(fx, y, fu, q_ex) +AbstractGPs.elbo(fx, y, fu) # %% scatter( @@ -128,5 +140,7 @@ scatter( title="posterior (VI with sparse grid)", label="Train Data", ) -plot!(-1:0.001:1, ap_ex; label=false) +plot!(-1:0.001:1, ap_ex; label="SVGP posterior") +plot!(-1:0.001:1, ap_tits; label="Titsias posterior") vline!(z; label="Pseudo-points") + From 0641423c1cfbee868ca66ceb7fabba3cd026bba9 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 20:01:27 +0100 Subject: [PATCH 04/66] Remove Flux as a dep & factor out expected_loglik --- Project.toml | 1 - src/svgp.jl | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 92dd78b6..e97fe5a2 100644 --- a/Project.toml +++ b/Project.toml @@ -6,7 +6,6 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/svgp.jl b/src/svgp.jl index 47fa7b57..5e57c721 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -26,8 +26,13 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - # TODO: implement properly - return mean(f, x), cov(f, x) + Cux = cov(f.prior, f.data.u, x) + Kuu = f.data.Kuu + B = f.data.B + D = f.data.Kuu.L \ Cux + μ = Cux' * f.data.α + Σ = cov(f.prior, x) - D'D + D' * B * B' * D + return μ, Σ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) @@ -35,6 +40,10 @@ function kl_divergence(q::MvNormal, p::AbstractMvNormal) AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) end +function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) +end + function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) @@ -43,10 +52,7 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal Σy = diag(fx.Σy) - # TODO: general method for likelihoods - quadrature like GPFlow? 
- variational_exp = -0.5 * ( - log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy - ) + variational_exp = expected_loglik(y, f_mean, f_var, Σy) # TODO: rescale for minibatches return sum(variational_exp) - kl_term end From 1e4fc90986ed5e7ba5296a59e2b732f1de9e9853 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 21:32:00 +0100 Subject: [PATCH 05/66] Update example to use basic Flux layer --- examples/gpflow_svgp.jl | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 158001e9..4f56b605 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -34,31 +34,31 @@ z = x[1:M] # A simple Flux model using Flux -struct SVGPModel - k - m - A - z +struct SVGPLayer + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end function make_kernel(k) return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) end -function (m::SVGPModel)(x, y) +function (m::SVGPLayer)(x) kernel = make_kernel(m.k) f = GP(kernel) - q = MvNormal(m.m, m.A'm.A + 0.001I) + q = MvNormal(m.m, m.A'm.A) fx = f(x, 0.1) fu = f(m.z, 0.1) - return -SparseGPs.elbo(fx, y, fu, q) + return fx, fu, q end -function posterior(m::SVGPModel) +function posterior(m::SVGPLayer) kernel = make_kernel(m.k) f = GP(kernel) fu = f(m.z, 0.1) - q = MvNormal(m.m, m.A'm.A + 0.0001I) + q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end @@ -66,10 +66,11 @@ k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) -model = SVGPModel(k, m, A, z) +model = SVGPLayer(k, m, A, z) function flux_loss(x, y) - return model(x, y) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q) end data = [(x, y)] From bb42044683078cc106a2dc0dbfb9daf62fd3f321 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 22 Jun 2021 02:49:55 +0100 Subject: [PATCH 06/66] Add minibatching. 
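The data term is rescaled so that a minibatch still gives an unbiased estimate
of the full ELBO (assuming batches are drawn uniformly): for a batch of size b
out of N points,

    elbo ≈ (N / b) * Σ_{i in batch} E_{q(f_i)}[log p(y_i | f_i)] - KL(q(u) || p(u))

This is what the n_data / n_batch scale factor implements; the example passes
n_data=N and n_batch=b alongside a Flux DataLoader with batchsize=b.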
--- examples/gpflow_svgp.jl | 47 ++++++++++++++++++++++++++--------------- src/svgp.jl | 7 +++--- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 4f56b605..63f06668 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -5,6 +5,7 @@ using SparseGPs using Distributions using LinearAlgebra using Optim +using IterTools using Plots default(; legend=:outertopright, size=(700, 400)) @@ -17,7 +18,7 @@ function g(x) return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) end -N = 1000 # Number of training points +N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) @@ -41,56 +42,68 @@ struct SVGPLayer z # inducing points end +@Flux.functor SVGPLayer + function make_kernel(k) - return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) + return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end function (m::SVGPLayer)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) - fx = f(x, 0.1) - fu = f(m.z, 0.1) + fx = f(x, 0.3) + fu = f(m.z, 0.3) return fx, fu, q end function posterior(m::SVGPLayer) kernel = make_kernel(m.k) f = GP(kernel) - fu = f(m.z, 0.1) + fu = f(m.z, 0.3) q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end +function flux_loss(x, y; n_data=1, n_batch=1) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) +end + +# Initialise the parameters k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) model = SVGPLayer(k, m, A, z) -function flux_loss(x, y) - fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q) -end - -data = [(x, y)] +b = 100 # minibatch size opt = ADAM(0.01) -parameters = Flux.params(k, m, A) +# parameters = Flux.params(k, s, m, A) +parameters = Flux.params(model) +data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +# %% println(flux_loss(x, y)) -for epoch in 1:300 - Flux.train!(flux_loss, parameters, data, opt) -end +Flux.train!( + (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), + parameters, + ncycle(data_loader, 100), # Train for 100 epochs + opt +) println(flux_loss(x, y)) -post = posterior(model) # %% +post = posterior(model) + scatter( x, y; - xlim=(0, 1), + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), xlabel="x", ylabel="y", title="posterior (VI with sparse grid)", diff --git a/src/svgp.jl b/src/svgp.jl index 5e57c721..2b050616 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -44,16 +44,15 @@ function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_va return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end -function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) +function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) f_mean = mean(post, fx.x) f_var = var(post, fx.x) - Σy = diag(fx.Σy) variational_exp = expected_loglik(y, f_mean, f_var, Σy) - # TODO: rescale for minibatches - return sum(variational_exp) - kl_term + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term end From 102d8128369f57a462b18ee67db3370dc86fcdd9 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Thu, 24 Jun 2021 00:28:44 +0100 Subject: [PATCH 07/66] Improved variance calculation. 
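The marginal variance only needs the diagonal of the posterior covariance, so
var and mean_and_var now use diag_At_A(D) and diag_At_A(B'D) rather than taking
diag() of the full dense matrix. Here diag_At_A(X) is understood to compute
diag(X'X), i.e. the column-wise sums of squares (roughly
vec(sum(abs2, X; dims=1))), which avoids forming the n-by-n covariance.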
--- src/SparseGPs.jl | 2 +- src/svgp.jl | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 24f7faa3..57697c8d 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -8,7 +8,7 @@ using LinearAlgebra using Statistics using StatsBase -using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, Xt_invA_X, diag_At_A +using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, At_A, diag_At_A export elbo, approx_posterior, diff --git a/src/svgp.jl b/src/svgp.jl index 2b050616..c082a83c 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,16 +1,17 @@ -struct SVGP end # TODO: should probably just be VFE? +struct SVGP end function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) - m, A = q.μ, cholesky(q.Σ) + m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fu))) - B = Kuu.L \ A.L + B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) return ApproxPosteriorGP(SVGP(), fu.f, data) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - # TODO: Don't compute the full covar - return diag(cov(f, x)) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -19,25 +20,29 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) - Kuu = f.data.Kuu - B = f.data.B D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - D'D + D' * B * B' * D + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) - Kuu = f.data.Kuu - B = f.data.B D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - D'D + D' * B * B' * D + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end +function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + μ = Cux' * f.data.α + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return μ, Σ_diag +end + function kl_divergence(q::MvNormal, p::AbstractMvNormal) (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + - AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) + AbstractGPs.Xt_invA_X(cholesky(cov(q)), (mean(q) - mean(p)))) end function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) @@ -47,8 +52,7 @@ end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) - f_mean = mean(post, fx.x) - f_var = var(post, fx.x) + f_mean, f_var = mean_and_var(post, fx.x) Σy = diag(fx.Σy) variational_exp = expected_loglik(y, f_mean, f_var, Σy) From 3089d939eb4c9dc34da953c6605942d69435e848 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Thu, 24 Jun 2021 15:11:55 +0100 Subject: [PATCH 08/66] Initial quadrature implementation --- Project.toml | 2 ++ src/SparseGPs.jl | 12 +++++++++++- src/svgp.jl | 39 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index e97fe5a2..f0fe1a01 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,8 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = 
"31c24e10-a181-5473-b8eb-7969acd0382f" +FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" +GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 57697c8d..309864b2 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -7,8 +7,18 @@ using StatsFuns using LinearAlgebra using Statistics using StatsBase +using FastGaussQuadrature +using GPLikelihoods -using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, At_A, diag_At_A +using AbstractGPs: + FiniteGP, + LatentFiniteGP, + ApproxPosteriorGP, + _cholesky, + _symmetric, + At_A, + diag_At_A, + Xt_invA_X export elbo, approx_posterior, diff --git a/src/svgp.jl b/src/svgp.jl index c082a83c..264b88dc 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -41,14 +41,37 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function kl_divergence(q::MvNormal, p::AbstractMvNormal) - (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + - AbstractGPs.Xt_invA_X(cholesky(cov(q)), (mean(q) - mean(p)))) + p_μ, p_Σ = mean(p), cov(p) + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ q.Σ) + + Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) end -function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) +# The closed form expected loglikelihood for a Gaussian likelihood +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector +) return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_quad_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + v, w = gausshermite(n_quad_points) + h = √2 * .√f_var' .* v .+ f_mean' + lls = loglikelihood.(lik.(h), y') + return ((1/√π) * w'lls)' +end + function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) @@ -60,3 +83,13 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal return sum(variational_exp) * scale - kl_term end +function elbo(fx::LatentFiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + kl_term = kl_divergence(q, fu) + post = approx_posterior(SVGP(), fu, q) + f_mean, f_var = mean_and_var(post, fx.fx.x) + + variational_exp = expected_loglik(y, f_mean, f_var, fx.lik) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + From 59474c5e357a3700eb58757d2bc060043bcf5060 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 2 Jul 2021 01:01:39 +0100 Subject: [PATCH 09/66] Moved quadrature to new file. 
--- Project.toml | 1 + examples/classification.jl | 41 ++++++++++++++++++++++++++++++++++++++ src/SparseGPs.jl | 2 ++ src/quadrature.jl | 18 +++++++++++++++++ src/svgp.jl | 12 +++-------- 5 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 examples/classification.jl create mode 100644 src/quadrature.jl diff --git a/Project.toml b/Project.toml index f0fe1a01..45712703 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" diff --git a/examples/classification.jl b/examples/classification.jl new file mode 100644 index 00000000..a522f016 --- /dev/null +++ b/examples/classification.jl @@ -0,0 +1,41 @@ +# Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html + +using SparseGPs +using AbstractGPs +using GPLikelihoods +using StatsFuns +using FastGaussQuadrature +using Distributions +using LinearAlgebra + +using Plots + +x = [5.668341708542713242, 5.758793969849246075, 5.517587939698492150, 2.954773869346733584, 3.648241206030150785, 2.110552763819095290, 4.613065326633165597, 4.793969849246231263, 4.703517587939698430, 6.030150753768843686, 3.015075376884421843, 3.979899497487437099, 3.226130653266331638, 1.899497487437185939, 1.145728643216080256, 3.316582914572864249, 6.030150753768843686, 2.231155778894472252, 3.256281407035175768, 1.085427135678391997, 1.809045226130653106, 4.492462311557789079, 1.959798994974874198, 0.000000000000000000, 3.346733668341708601, 1.507537688442210921, 1.809045226130653328, 5.517587939698492150, 2.201005025125628123, 5.577889447236180409, 1.809045226130653328, 1.688442211055276365, 4.160804020100502321, 2.170854271356783993, 4.311557788944723413, 3.075376884422110546, 5.125628140703517133, 1.989949748743718549, 5.366834170854271058, 4.100502512562814061, 7.236180904522613311, 2.261306532663316382, 3.467336683417085119, 1.085427135678391997, 5.095477386934673447, 5.185929648241205392, 2.743718592964823788, 2.773869346733668362, 1.417085427135678311, 1.989949748743718549] +y = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1] + +function make_kernel(k) + return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) +end + +k = [0.1, 0.1] + +kernel = make_kernel(k) +f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) +fx = f(x) +z = x[1:10] +fu = f(z).fx # want the underlying FiniteGP +q = MvNormal(zeros(length(z)), I) + +SparseGPs.kl_divergence(q, fu) +SparseGPs.elbo(fx, y, fu, q) + +post = SparseGPs.approx_posterior(SVGP(), fu, q) +f_mean, f_var = mean_and_var(post, fx.fx.x) + + +# v = inputs to evaluate +# w = weights +v, w = gausshermite(20); +h = √2 * .√f_var' .* v .+ f_mean' +lls = loglikelihood.(f.lik.(h), y') +var_exp = (1/√π) * sum(w'lls) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 309864b2..335af352 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -9,6 +9,7 @@ using Statistics using StatsBase using FastGaussQuadrature using GPLikelihoods +using ChainRulesCore using AbstractGPs: FiniteGP, @@ -24,6 +25,7 @@ export elbo, approx_posterior, SVGP +include("quadrature.jl") include("svgp.jl") end diff --git a/src/quadrature.jl b/src/quadrature.jl new file mode 100644 
index 00000000..7ba26e20 --- /dev/null +++ b/src/quadrature.jl @@ -0,0 +1,18 @@ + +function gauss_hermite_quadrature( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + xs, ws = gausshermite(n_points) + fs = √2 * .√f_var' .* xs .+ f_mean' + lls = loglikelihood.(lik.(fs), y') + return ((1/√π) * ws'lls)' +end + +ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/svgp.jl b/src/svgp.jl index 264b88dc..7cc31515 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -57,19 +57,13 @@ function expected_loglik( end function expected_loglik( - y::AbstractVector{<:Real}, + y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::BernoulliLikelihood; - n_quad_points=20 + n_points=20 ) - # Compute the expectation via Gauss-Hermite quadrature - # using a reparameterisation by change of variable - # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - v, w = gausshermite(n_quad_points) - h = √2 * .√f_var' .* v .+ f_mean' - lls = loglikelihood.(lik.(h), y') - return ((1/√π) * w'lls)' + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) From 25e662791c1ba444803b618a7afb06fd9124e301 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 3 Jul 2021 23:48:05 +0100 Subject: [PATCH 10/66] Fixed AD for quadrature. --- src/quadrature.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/quadrature.jl b/src/quadrature.jl index 7ba26e20..268e450d 100644 --- a/src/quadrature.jl +++ b/src/quadrature.jl @@ -1,18 +1,18 @@ - function gauss_hermite_quadrature( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::BernoulliLikelihood; + lik; n_points=20 ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(n_points) - fs = √2 * .√f_var' .* xs .+ f_mean' - lls = loglikelihood.(lik.(fs), y') - return ((1/√π) * ws'lls)' + # size(fs): (n_points, length(y)) + fs = √2 * .√f_var .* transpose(xs) .+ f_mean + lls = loglikelihood.(lik.(fs), y) + return (1/√π) * lls * ws end ChainRulesCore.@non_differentiable gausshermite(n) From 54b5470bc470f1422464b423887756e5ba656f7c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 3 Jul 2021 23:48:29 +0100 Subject: [PATCH 11/66] Fixed AD for KL divergence. --- src/svgp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 7cc31515..6962a148 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -42,7 +42,7 @@ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ q.Σ) + + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) end From 5e1c8829eee86dd6139ade8fc8c264952ff7511a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:15:07 +0100 Subject: [PATCH 12/66] Added classification example. 
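The example below exercises the full ELBO, whose other ingredient is the closed-form KL term KL(q(u) || p(u)) = ½ (logdet Σp - logdet Σq - d + tr(Σp⁻¹ Σq) + (μq - μp)ᵀ Σp⁻¹ (μq - μp)). A minimal sketch of that formula (restated locally as `kl_check` rather than calling the package, with arbitrary values), verified against the familiar one-dimensional expression:

    using Distributions, LinearAlgebra

    function kl_check(q::AbstractMvNormal, p::AbstractMvNormal)
        q_μ, q_Σ, p_μ, p_Σ = mean(q), cov(q), mean(p), cov(p)
        return 0.5 * (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) +
                      tr(p_Σ \ q_Σ) + (q_μ - p_μ)' * (p_Σ \ (q_μ - p_μ)))
    end

    μq, σq, μp, σp = 0.3, 0.8, -0.1, 1.2
    q = MvNormal([μq], fill(σq^2, 1, 1))
    p = MvNormal([μp], fill(σp^2, 1, 1))
    kl_check(q, p) ≈ log(σp / σq) + (σq^2 + (μq - μp)^2) / (2σp^2) - 0.5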
--- examples/classification.jl | 143 +++++++++++++++++++++++++++++++---- examples/data/classif_1D.csv | 50 ++++++++++++ src/quadrature.jl | 2 +- 3 files changed, 179 insertions(+), 16 deletions(-) create mode 100644 examples/data/classif_1D.csv diff --git a/examples/classification.jl b/examples/classification.jl index a522f016..21266bda 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -1,5 +1,6 @@ # Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html +# %% using SparseGPs using AbstractGPs using GPLikelihoods @@ -7,35 +8,147 @@ using StatsFuns using FastGaussQuadrature using Distributions using LinearAlgebra +using DelimitedFiles +using IterTools using Plots -x = [5.668341708542713242, 5.758793969849246075, 5.517587939698492150, 2.954773869346733584, 3.648241206030150785, 2.110552763819095290, 4.613065326633165597, 4.793969849246231263, 4.703517587939698430, 6.030150753768843686, 3.015075376884421843, 3.979899497487437099, 3.226130653266331638, 1.899497487437185939, 1.145728643216080256, 3.316582914572864249, 6.030150753768843686, 2.231155778894472252, 3.256281407035175768, 1.085427135678391997, 1.809045226130653106, 4.492462311557789079, 1.959798994974874198, 0.000000000000000000, 3.346733668341708601, 1.507537688442210921, 1.809045226130653328, 5.517587939698492150, 2.201005025125628123, 5.577889447236180409, 1.809045226130653328, 1.688442211055276365, 4.160804020100502321, 2.170854271356783993, 4.311557788944723413, 3.075376884422110546, 5.125628140703517133, 1.989949748743718549, 5.366834170854271058, 4.100502512562814061, 7.236180904522613311, 2.261306532663316382, 3.467336683417085119, 1.085427135678391997, 5.095477386934673447, 5.185929648241205392, 2.743718592964823788, 2.773869346733668362, 1.417085427135678311, 1.989949748743718549] -y = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1] +# %% +# Read in the classification data +data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv" +x, y = eachcol(readdlm(data_file)) +scatter(x, y) + + +# %% +# First, create the GP kernel from given parameters k function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end -k = [0.1, 0.1] +k = [10, 0.1] kernel = make_kernel(k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) fx = f(x) -z = x[1:10] + + +# %% +# Then, plot some samples from the prior underlying GP +x_plot = 0:0.02:6 +prior_f_samples = rand(f.f(x_plot, 1e-6),20) + +plt = plot( + x_plot, + prior_f_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") + +# %% +# Plot the same samples, but pushed through a logistic sigmoid to constrain +# them in (0, 1). 
+prior_y_samples = mean.(f.lik.(prior_f_samples)) + +plt = plot( + x_plot, + prior_y_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") + + +# %% +using Flux + +struct SVGPLayer + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points +end + +@Flux.functor SVGPLayer (k, m, A,) # Don't train the inducing inputs + +lik = BernoulliLikelihood() +function (m::SVGPLayer)(x) + kernel = make_kernel(m.k) + f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) + q = MvNormal(m.m, m.A'm.A) + fx = f(x) + fu = f(m.z).fx + return fx, fu, q +end + +function flux_loss(x, y; n_data=1, n_batch=1) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) +end + +# %% +M = 15 # number of inducing points + +# Initialise the parameters +k = [10, 0.1] +m = zeros(M) +A = Matrix{Float64}(I, M, M) +z = x[1:M] + +model = SVGPLayer(k, m, A, z) + +opt = ADAM(0.1) +parameters = Flux.params(model) + +# %% +# Negative ELBO before training +println(flux_loss(x, y)) + +# %% +# Train the model +Flux.train!( + (x, y) -> flux_loss(x, y), + parameters, + ncycle([(x, y)], 500), # Train for 1000 epochs + opt +) + +# %% +# Negative ELBO after training +println(flux_loss(x, y)) + +# %% +# After optimisation, plot samples from the underlying posterior GP. + fu = f(z).fx # want the underlying FiniteGP -q = MvNormal(zeros(length(z)), I) +post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) +l_post = LatentGP(post, BernoulliLikelihood(), 0.1) -SparseGPs.kl_divergence(q, fu) -SparseGPs.elbo(fx, y, fu, q) +post_f_samples = rand(l_post.f(x_plot, 1e-6),20) -post = SparseGPs.approx_posterior(SVGP(), fu, q) -f_mean, f_var = mean_and_var(post, fx.fx.x) +plt = plot( + x_plot, + post_f_samples; + seriescolor="red", + linealpha=0.2, + legend=false +) +# %% +# As above, push these samples through a logistic sigmoid to get posterior predictions. 
+post_y_samples = mean.(l_post.lik.(post_f_samples)) -# v = inputs to evaluate -# w = weights -v, w = gausshermite(20); -h = √2 * .√f_var' .* v .+ f_mean' -lls = loglikelihood.(f.lik.(h), y') -var_exp = (1/√π) * sum(w'lls) +plt = plot( + x_plot, + post_y_samples; + seriescolor="red", + linealpha=0.2, + # legend=false, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") +vline!(z; label="Pseudo-points") diff --git a/examples/data/classif_1D.csv b/examples/data/classif_1D.csv new file mode 100644 index 00000000..70ddb862 --- /dev/null +++ b/examples/data/classif_1D.csv @@ -0,0 +1,50 @@ +5.668341708542713242e+00 0.000000000000000000e+00 +5.758793969849246075e+00 0.000000000000000000e+00 +5.517587939698492150e+00 0.000000000000000000e+00 +2.954773869346733584e+00 1.000000000000000000e+00 +3.648241206030150785e+00 1.000000000000000000e+00 +2.110552763819095290e+00 1.000000000000000000e+00 +4.613065326633165597e+00 0.000000000000000000e+00 +4.793969849246231263e+00 0.000000000000000000e+00 +4.703517587939698430e+00 0.000000000000000000e+00 +6.030150753768843686e-01 1.000000000000000000e+00 +3.015075376884421843e-01 0.000000000000000000e+00 +3.979899497487437099e+00 0.000000000000000000e+00 +3.226130653266331638e+00 1.000000000000000000e+00 +1.899497487437185939e+00 1.000000000000000000e+00 +1.145728643216080256e+00 1.000000000000000000e+00 +3.316582914572864249e-01 0.000000000000000000e+00 +6.030150753768843686e-01 1.000000000000000000e+00 +2.231155778894472252e+00 1.000000000000000000e+00 +3.256281407035175768e+00 1.000000000000000000e+00 +1.085427135678391997e+00 1.000000000000000000e+00 +1.809045226130653106e+00 1.000000000000000000e+00 +4.492462311557789079e+00 0.000000000000000000e+00 +1.959798994974874198e+00 1.000000000000000000e+00 +0.000000000000000000e+00 0.000000000000000000e+00 +3.346733668341708601e+00 1.000000000000000000e+00 +1.507537688442210921e-01 0.000000000000000000e+00 +1.809045226130653328e-01 1.000000000000000000e+00 +5.517587939698492150e+00 0.000000000000000000e+00 +2.201005025125628123e+00 1.000000000000000000e+00 +5.577889447236180409e+00 0.000000000000000000e+00 +1.809045226130653328e-01 0.000000000000000000e+00 +1.688442211055276365e+00 1.000000000000000000e+00 +4.160804020100502321e+00 0.000000000000000000e+00 +2.170854271356783993e+00 1.000000000000000000e+00 +4.311557788944723413e+00 0.000000000000000000e+00 +3.075376884422110546e+00 1.000000000000000000e+00 +5.125628140703517133e+00 0.000000000000000000e+00 +1.989949748743718549e+00 1.000000000000000000e+00 +5.366834170854271058e+00 0.000000000000000000e+00 +4.100502512562814061e+00 0.000000000000000000e+00 +7.236180904522613311e-01 1.000000000000000000e+00 +2.261306532663316382e+00 1.000000000000000000e+00 +3.467336683417085119e+00 1.000000000000000000e+00 +1.085427135678391997e+00 1.000000000000000000e+00 +5.095477386934673447e+00 0.000000000000000000e+00 +5.185929648241205392e+00 0.000000000000000000e+00 +2.743718592964823788e+00 1.000000000000000000e+00 +2.773869346733668362e+00 1.000000000000000000e+00 +1.417085427135678311e+00 1.000000000000000000e+00 +1.989949748743718549e+00 1.000000000000000000e+00 diff --git a/src/quadrature.jl b/src/quadrature.jl index 268e450d..7a1de617 100644 --- a/src/quadrature.jl +++ b/src/quadrature.jl @@ -9,7 +9,7 @@ function gauss_hermite_quadrature( # using a reparameterisation by change of variable # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(n_points) - # size(fs): (n_points, length(y)) + # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) return (1/√π) * lls * ws From ce20ebac3395c595fe59599a54fbd840781ac851 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:43:58 +0100 Subject: [PATCH 13/66] Updated examples. --- examples/classification.jl | 4 ++ examples/{gpflow_svgp.jl => regression.jl} | 55 +++++++++++++--------- 2 files changed, 38 insertions(+), 21 deletions(-) rename examples/{gpflow_svgp.jl => regression.jl} (66%) diff --git a/examples/classification.jl b/examples/classification.jl index 21266bda..211c01af 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,10 @@ using DelimitedFiles using IterTools using Plots +default(; legend=:outertopright, size=(700, 400)) +using Random +Random.seed!(1234) # %% # Read in the classification data @@ -64,6 +67,7 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% +# A simple Flux model using Flux struct SVGPLayer diff --git a/examples/gpflow_svgp.jl b/examples/regression.jl similarity index 66% rename from examples/gpflow_svgp.jl rename to examples/regression.jl index 63f06668..59b10b35 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/regression.jl @@ -1,4 +1,4 @@ -# An attempted recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html +# A recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html using AbstractGPs using SparseGPs @@ -14,6 +14,7 @@ using Random Random.seed!(1234) # %% +# The data generating function function g(x) return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) end @@ -24,31 +25,28 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) -# %% -M = 50 # number of inducing points - -# TODO: incorporate better inducing point selection from -# https://github.com/JuliaGaussianProcesses/InducingPoints.jl? -z = x[1:M] # %% # A simple Flux model using Flux -struct SVGPLayer +struct SVGPModel k # kernel parameters m # variational mean A # variational covariance z # inducing points end -@Flux.functor SVGPLayer +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end -function (m::SVGPLayer)(x) +# Create the 'model' from the parameters - i.e. return the FiniteGP at inputs x, +# the FiniteGP at inducing inputs z and the variational posterior over inducing +# points - q(u). +function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) @@ -57,7 +55,8 @@ function (m::SVGPLayer)(x) return fx, fu, q end -function posterior(m::SVGPLayer) +# Create the posterior GP from the model parameters. +function posterior(m::SVGPModel) kernel = make_kernel(m.k) f = GP(kernel) fu = f(m.z, 0.3) @@ -65,37 +64,50 @@ function posterior(m::SVGPLayer) return SparseGPs.approx_posterior(SVGP(), fu, q) end +# Return the loss given data - in this case the negative ELBO. 
function flux_loss(x, y; n_data=1, n_batch=1) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) end + +# %% +M = 50 # number of inducing points + +# Select the first M inputs as inducing inputs +z = x[1:M] + # Initialise the parameters k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) -model = SVGPLayer(k, m, A, z) +model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.01) -# parameters = Flux.params(k, s, m, A) parameters = Flux.params(model) data_loader = Flux.Data.DataLoader((x, y), batchsize=b) # %% +# Negative ELBO before training println(flux_loss(x, y)) +# %% +# Train the model Flux.train!( (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), parameters, - ncycle(data_loader, 100), # Train for 100 epochs + ncycle(data_loader, 300), # Train for 400 epochs opt ) +# %% +# Negative ELBO after training println(flux_loss(x, y)) # %% +# Plot samples from the optmimised approximate posterior. post = posterior(model) scatter( @@ -113,9 +125,10 @@ plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") -# %% Find the exact posterior over u (e.g. -# https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations -# (11) & (12)) As a sanity check. +# %% There is a closed form optimal solution for the variational posterior q(u) +# (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ +# equations (11) & (12)). The SVGP posterior with this optimal q(u) should +# therefore be equivalent to the 'exact' sparse GP (Titsias) posterior. function exact_q(fu, fx, y) σ² = fx.Σy[1] @@ -123,8 +136,8 @@ function exact_q(fu, fx, y) Kuu = Symmetric(cov(fu)) Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) m = ((1/σ²)*Kuu* (Σ\Kuf)) * y - A = Symmetric(Kuu * (Σ \ Kuu)) - return MvNormal(m, A) + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) end kernel = make_kernel([0.2, 11]) @@ -136,8 +149,8 @@ q_ex = exact_q(fu, fx, y) scatter(x, y) scatter!(z, q_ex.μ) -# These two should be the same - and they are, the plot below shows almost identical predictions -ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman 2013 (exact) posterior +# These two should be the same - and they are, as the plot below shows +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior # Should these be the same? (they currently aren't) From 359b3d54d621c5a0b6af21c2ee2071506e58e45c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:49:12 +0100 Subject: [PATCH 14/66] Renamed SVGPLayer to SVGPModel. 
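For the record, the `exact_q` helper above implements the optimal variational distribution q*(u) = N(m, S) with Σ = Kuu + σ⁻² Kuf Kfu, m = σ⁻² Kuu Σ⁻¹ Kuf y and S = Kuu Σ⁻¹ Kuu. When the inducing inputs coincide with the training inputs (so Kuu = Kuf = Kxx), this collapses to the exact GP posterior at the training points. A minimal numerical sketch of that reduction, with an arbitrary positive-definite matrix standing in for the kernel matrix:

    using LinearAlgebra

    n, σ² = 5, 0.3
    B = randn(n, n)
    K = Symmetric(B * B' + I)      # stand-in for Kuu = Kuf = Kxx
    y = randn(n)

    Σ = Symmetric(K + (1 / σ²) * K * K)
    m = ((1 / σ²) * K * (Σ \ K)) * y       # optimal variational mean
    S = Symmetric(K * (Σ \ K))             # optimal variational covariance

    m ≈ K * ((K + σ² * I) \ y)             # exact posterior mean at the training inputs
    S ≈ K - K * ((K + σ² * I) \ K)         # exact posterior covariance at the training inputs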
--- examples/classification.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 211c01af..85b875ca 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -70,17 +70,17 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # A simple Flux model using Flux -struct SVGPLayer +struct SVGPModel k # kernel parameters m # variational mean A # variational covariance z # inducing points end -@Flux.functor SVGPLayer (k, m, A,) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() -function (m::SVGPLayer)(x) +function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) q = MvNormal(m.m, m.A'm.A) @@ -103,7 +103,7 @@ m = zeros(M) A = Matrix{Float64}(I, M, M) z = x[1:M] -model = SVGPLayer(k, m, A, z) +model = SVGPModel(k, m, A, z) opt = ADAM(0.1) parameters = Flux.params(model) From 3bdbedb4a4fe8e05f79960c10618548f53f8371a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 13:39:00 +0100 Subject: [PATCH 15/66] Added basic test structure. --- test/Project.toml | 3 +++ test/runtests.jl | 12 ++++++++++++ test/svgp.jl | 4 ++++ 3 files changed, 19 insertions(+) create mode 100644 test/Project.toml create mode 100644 test/runtests.jl create mode 100644 test/svgp.jl diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 00000000..7a21f898 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,3 @@ +[deps] +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 00000000..914269c5 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,12 @@ +using Random +using Test +using SparseGPs + +const GROUP = get(ENV, "GROUP", "All") +const PKGDIR = dirname(dirname(pathof(SparseGPs))) + +@testset "SparseGPs" begin + include("svgp.jl") + println(" ") + @info "Ran svgp tests" +end diff --git a/test/svgp.jl b/test/svgp.jl new file mode 100644 index 00000000..b5e84b04 --- /dev/null +++ b/test/svgp.jl @@ -0,0 +1,4 @@ +@testset "svgp" begin + x = 4 + @test x == 4 +end From cb3a341d6f2bcb1b4474d7e3b2c8b8f48ef94681 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 14:57:54 +0100 Subject: [PATCH 16/66] Started equivalence tests --- test/equivalences.jl | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 test/equivalences.jl diff --git a/test/equivalences.jl b/test/equivalences.jl new file mode 100644 index 00000000..69e83a76 --- /dev/null +++ b/test/equivalences.jl @@ -0,0 +1,8 @@ +@testset "equivalences" begin + rng, N = MersenneTwister(654321), 20 + x = rand(rng, N) + y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) + + z = copy(x) # Set inducing inputs == training inputs + +end From 3a2c8a921fc2ab339a00196bcb23034419bc6537 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 16:55:29 +0100 Subject: [PATCH 17/66] First pass (doesn't work yet) --- test/Project.toml | 2 + test/equivalences.jl | 88 +++++++++++++++++++++++++++++++++++++++++++- test/runtests.jl | 10 +++++ test/test_utils.jl | 0 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 test/test_utils.jl diff --git a/test/Project.toml b/test/Project.toml index 7a21f898..e089a59d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,3 +1,5 @@ [deps] +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +IterTools = 
"c8e1da08-722c-5040-9ed9-7db0dc04731e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/equivalences.jl b/test/equivalences.jl index 69e83a76..70e81aab 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,5 +4,91 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + + # Create a kernel from parameters k + kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + k_init = [0.1, 0.1] # initial kernel parameters + + lik_noise = 0.1 # The (fixed) Gaussian likelihood noise + + ## FIRST - define the models + # GPR - Exact GP regression + struct GPRModel + k # kernel parameters + end + @Flux.functor GPRModel + + function (m::GPRModel)(x) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + return fx + end + + # SGPR - Sparse GP regression (Titsias 2009) + struct SGPRModel + k # kernel parameters + z # inducing points + end + @Flux.functor SGPRModel (k,) # Don't train the inducing inputs + + function (m::SGPRModel)(x) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + fz = f(m.z, lik_noise) + return fx, fz + end + + # SVGP - Sparse variational GP regression (Hensman 2014) + struct SVGPModel + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) + end + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + + function (m::SVGPModel)(x) + f = GP(kernel(m.k)) + q = MvNormal(m.m, m.A'm.A) + fx = f(x, lik_noise) + fz = f(m.z, lik_noise) + return fx, fz, q + end + + ## SECOND - create the models and associated training losses + gpr = GPRModel(copy(k_init)) + function GPR_loss(x, y) + fx = gpr(x) + return -logpdf(fx, y) + end + + sgpr = SGPRModel(copy(k_init), copy(z)) + function SGPR_loss(x, y) + fx, fz = sgpr(x) + return -AbstractGPs.elbo(fx, y, fz) + end + + m, A = rand(rng, N), rand(rng, N, N) # initialise the variational parameters + svgp = SVGPModel(copy(k_init), copy(z), m, A) + function SVGP_loss(x, y) + fx, fz, q = svgp(x) + return -SparseGPs.elbo(fx, y, fz, q) + end + + ## THIRD - train the models + data = [(x, y)] + opt = ADAM(0.01) + + Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 300), opt) + Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 300), opt) + Flux.train!((x, y) -> SVGP_loss(x, y), Flux.params(svgp), ncycle(data, 300), opt) + + ## FOURTH - test equivalence + println(gpr.k) + println(sgpr.k) + println(svgp.k) + @test gpr.k ≈ svgp.k + + # TODO: test posterior predictions end + diff --git a/test/runtests.jl b/test/runtests.jl index 914269c5..c2146c68 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,12 +1,22 @@ using Random using Test using SparseGPs +using Flux +using IterTools +using AbstractGPs +using SparseGPs const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) +include("test_utils.jl") + @testset "SparseGPs" begin include("svgp.jl") println(" ") @info "Ran svgp tests" + + include("equivalences.jl") + println(" ") + @info "Ran equivalences tests" end diff --git a/test/test_utils.jl b/test/test_utils.jl new file mode 100644 index 00000000..e69de29b From 005f8f03c9d896774260b760df33d77cad00d42d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 02:43:17 +0100 Subject: [PATCH 18/66] Working tests --- test/Project.toml | 2 ++ test/equivalences.jl | 85 ++++++++++++++++++++++++++++---------------- test/runtests.jl | 2 +- 3 files changed, 58 
insertions(+), 31 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index e089a59d..a4a781f3 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,6 @@ [deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/equivalences.jl b/test/equivalences.jl index 70e81aab..46fb2ba4 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -1,15 +1,16 @@ @testset "equivalences" begin rng, N = MersenneTwister(654321), 20 - x = rand(rng, N) + x = rand(rng, N) * 10 y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs # Create a kernel from parameters k kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - k_init = [0.1, 0.1] # initial kernel parameters + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise + jitter = 1e-5 ## FIRST - define the models # GPR - Exact GP regression @@ -24,19 +25,19 @@ return fx end - # SGPR - Sparse GP regression (Titsias 2009) - struct SGPRModel - k # kernel parameters - z # inducing points - end - @Flux.functor SGPRModel (k,) # Don't train the inducing inputs + # # SGPR - Sparse GP regression (Titsias 2009) + # struct SGPRModel + # k # kernel parameters + # z # inducing points + # end + # @Flux.functor SGPRModel (k,) # Don't train the inducing inputs - function (m::SGPRModel)(x) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - fz = f(m.z, lik_noise) - return fx, fz - end + # function (m::SGPRModel)(x) + # f = GP(kernel(m.k)) + # fx = f(x, lik_noise) + # fz = f(m.z, lik_noise) + # return fx, fz + # end # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel @@ -51,7 +52,7 @@ f = GP(kernel(m.k)) q = MvNormal(m.m, m.A'm.A) fx = f(x, lik_noise) - fz = f(m.z, lik_noise) + fz = f(m.z, jitter) return fx, fz, q end @@ -62,13 +63,13 @@ return -logpdf(fx, y) end - sgpr = SGPRModel(copy(k_init), copy(z)) - function SGPR_loss(x, y) - fx, fz = sgpr(x) - return -AbstractGPs.elbo(fx, y, fz) - end + # sgpr = SGPRModel(copy(k_init), copy(z)) + # function SGPR_loss(x, y) + # fx, fz = sgpr(x) + # return -AbstractGPs.elbo(fx, y, fz) + # end - m, A = rand(rng, N), rand(rng, N, N) # initialise the variational parameters + m, A = rand(rng, N), rand(rng, N, N)/2 # initialise the variational parameters svgp = SVGPModel(copy(k_init), copy(z), m, A) function SVGP_loss(x, y) fx, fz, q = svgp(x) @@ -79,16 +80,40 @@ data = [(x, y)] opt = ADAM(0.01) - Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 300), opt) - Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 300), opt) - Flux.train!((x, y) -> SVGP_loss(x, y), Flux.params(svgp), ncycle(data, 300), opt) + svgp_ps = Flux.params(svgp) + delete!(svgp_ps, svgp.k) # Don't train the kernel parameters + + # Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 3000), opt) + # Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 3000), opt) + Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 9000), opt) + + ## FOURTH - construct the posteriors + function posterior(m::GPRModel, x, y) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + return AbstractGPs.posterior(fx, y) + end + + # function posterior(m::SGPRModel, x, y) + # f = GP(kernel(m.k)) + # fx = f(x, lik_noise) + # fz = 
f(m.z) + # return AbstractGPs.approx_posterior(VFE(), fx, y, fz) + # end + + function posterior(m::SVGPModel) + f = GP(kernel(m.k)) + fz = f(m.z, jitter) + q = MvNormal(m.m, m.A'm.A) + return SparseGPs.approx_posterior(SVGP(), fz, q) + end + gpr_post = posterior(gpr, x, y) + # sgpr_post = posterior(sgpr, x, y) + svgp_post = posterior(svgp) - ## FOURTH - test equivalence - println(gpr.k) - println(sgpr.k) - println(svgp.k) - @test gpr.k ≈ svgp.k + ## FIFTH - test equivalences + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-3)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-3)) - # TODO: test posterior predictions end diff --git a/test/runtests.jl b/test/runtests.jl index c2146c68..e5d8346a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using SparseGPs using Flux using IterTools using AbstractGPs -using SparseGPs +using Distributions const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) From 443a2d4f1c5675b9bf21b9cedbe98a1483f53e92 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 02:47:52 +0100 Subject: [PATCH 19/66] Fixed KL divergence --- examples/regression.jl | 2 +- src/svgp.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/regression.jl b/examples/regression.jl index 59b10b35..6f5b0766 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -98,7 +98,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), parameters, - ncycle(data_loader, 300), # Train for 400 epochs + ncycle(data_loader, 300), # Train for 300 epochs opt ) diff --git a/src/svgp.jl b/src/svgp.jl index 6962a148..0245ed71 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -43,7 +43,7 @@ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) + Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) end # The closed form expected loglikelihood for a Gaussian likelihood From 92da73c4871becf5418257be2c11ce28e691a181 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 18:06:41 +0100 Subject: [PATCH 20/66] Refactored elbo stuff --- src/SparseGPs.jl | 2 +- src/elbo.jl | 100 ++++++++++++++++++++++++++++++++++++++++++++++ src/quadrature.jl | 18 --------- src/svgp.jl | 74 ++++++++++------------------------ 4 files changed, 123 insertions(+), 71 deletions(-) create mode 100644 src/elbo.jl delete mode 100644 src/quadrature.jl diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 335af352..e34e34c1 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -25,7 +25,7 @@ export elbo, approx_posterior, SVGP -include("quadrature.jl") +include("elbo.jl") include("svgp.jl") end diff --git a/src/elbo.jl b/src/elbo.jl new file mode 100644 index 00000000..0df7f051 --- /dev/null +++ b/src/elbo.jl @@ -0,0 +1,100 @@ +""" + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + +Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are +observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a +variational distribution over inducing points `u = f(z)`. + +[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable +variational Gaussian process classification." Artificial Intelligence and +Statistics. PMLR, 2015. 
+""" + +function elbo( + fx::FiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal; + n_data=1, + n_batch=1 +) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + + Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise + variational_exp = expected_loglik(y, f_mean, f_var, Σy) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + +function elbo( + lfx::LatentFiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal; + n_data=1, + n_batch=1 +) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + + variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + +# Computes the common intermediates needed for the ELBO +function _elbo_intermediates( + fx::FiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal +) + kl_term = kl_divergence(q, fz) + post = approx_posterior(SVGP(), fz, q) + f_mean, f_var = mean_and_var(post, fx.fx.x) + return kl_term, f_mean, f_var +end + +# The closed form expected loglikelihood for a Gaussian likelihood +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector +) + return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) +end + +function expected_loglik( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_points=20 +) + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) +end + +function kl_divergence(q::MvNormal, p::AbstractMvNormal) + p_μ, p_Σ = mean(p), cov(p) + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + + Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) +end + +function gauss_hermite_quadrature( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik; + n_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + xs, ws = gausshermite(n_points) + # size(fs): (length(y), n_points) + fs = √2 * .√f_var .* transpose(xs) .+ f_mean + lls = loglikelihood.(lik.(fs), y) + return (1/√π) * lls * ws +end + +ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/quadrature.jl b/src/quadrature.jl deleted file mode 100644 index 7a1de617..00000000 --- a/src/quadrature.jl +++ /dev/null @@ -1,18 +0,0 @@ -function gauss_hermite_quadrature( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik; - n_points=20 -) - # Compute the expectation via Gauss-Hermite quadrature - # using a reparameterisation by change of variable - # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - xs, ws = gausshermite(n_points) - # size(fs): (length(y), n_points) - fs = √2 * .√f_var .* transpose(xs) .+ f_mean - lls = loglikelihood.(lik.(fs), y) - return (1/√π) * lls * ws -end - -ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/svgp.jl b/src/svgp.jl index 0245ed71..229d2173 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,11 +1,29 @@ struct SVGP end -function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) +""" + approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) + +Compute the approximate posterior [1] over the process `f = fz.f`, given inducing +inputs `z = fz.x` and a variational distribution over inducing points `q(u)` where `u = +f(z)`. 
The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` +is then given by: + +```math +q(f^*) = \int p(f | u) q(u) du +``` +which can be found in closed form. + +[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable +variational Gaussian process classification." Artificial Intelligence and +Statistics. PMLR, 2015. +""" + +function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) m, A = mean(q), cholesky(cov(q)) - Kuu = cholesky(Symmetric(cov(fu))) + Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L - data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) - return ApproxPosteriorGP(SVGP(), fu.f, data) + data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) + return ApproxPosteriorGP(SVGP(), fz.f, data) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -39,51 +57,3 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end - -function kl_divergence(q::MvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) -end - -# The closed form expected loglikelihood for a Gaussian likelihood -function expected_loglik( - y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractVector -) - return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) -end - -function expected_loglik( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik::BernoulliLikelihood; - n_points=20 -) - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) -end - -function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) - kl_term = kl_divergence(q, fu) - post = approx_posterior(SVGP(), fu, q) - f_mean, f_var = mean_and_var(post, fx.x) - Σy = diag(fx.Σy) - - variational_exp = expected_loglik(y, f_mean, f_var, Σy) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term -end - -function elbo(fx::LatentFiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) - kl_term = kl_divergence(q, fu) - post = approx_posterior(SVGP(), fu, q) - f_mean, f_var = mean_and_var(post, fx.fx.x) - - variational_exp = expected_loglik(y, f_mean, f_var, fx.lik) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term -end - From 7d05d1b92969d79ca5e4a7b093cab1f67ad3611e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 12:46:50 +0100 Subject: [PATCH 21/66] Fixed elbo mistakes --- examples/classification.jl | 4 ++-- src/elbo.jl | 7 +++---- src/svgp.jl | 18 ++++++++++-------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 85b875ca..6dfd0c00 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -117,7 +117,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y), parameters, - ncycle([(x, y)], 500), # Train for 1000 epochs + ncycle([(x, y)], 1000), # Train for 1000 epochs opt ) @@ -132,7 +132,7 @@ fu = f(z).fx # want the underlying FiniteGP post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) l_post = LatentGP(post, BernoulliLikelihood(), 0.1) -post_f_samples = rand(l_post.f(x_plot, 1e-6),20) +post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) plt = plot( x_plot, diff --git a/src/elbo.jl b/src/elbo.jl index 0df7f051..827af5f3 100644 --- 
a/src/elbo.jl +++ b/src/elbo.jl @@ -18,7 +18,7 @@ function elbo( n_data=1, n_batch=1 ) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise variational_exp = expected_loglik(y, f_mean, f_var, Σy) @@ -34,7 +34,7 @@ function elbo( n_data=1, n_batch=1 ) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) scale = n_data / n_batch @@ -44,13 +44,12 @@ end # Computes the common intermediates needed for the ELBO function _elbo_intermediates( fx::FiniteGP, - y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal ) kl_term = kl_divergence(q, fz) post = approx_posterior(SVGP(), fz, q) - f_mean, f_var = mean_and_var(post, fx.fx.x) + f_mean, f_var = mean_and_var(post, fx.x) return kl_term, f_mean, f_var end diff --git a/src/svgp.jl b/src/svgp.jl index 229d2173..15b0a94d 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -9,7 +9,7 @@ f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` is then given by: ```math -q(f^*) = \int p(f | u) q(u) du +q(f^*) = \\int p(f | u) q(u) du ``` which can be found in closed form. @@ -19,19 +19,13 @@ Statistics. PMLR, 2015. """ function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) - m, A = mean(q), cholesky(cov(q)) + m, A = q.μ, cholesky(q.Σ) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end -function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - Cux = cov(f.prior, f.data.u, x) - D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) -end - function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return cov(f.prior, x, f.data.u) * f.data.α end @@ -42,6 +36,14 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end +function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) +end + +#TODO: cov(x, y) + function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux From c0dd7372e70331b63ff00fffae831d2c18bcc74d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 14:01:17 +0100 Subject: [PATCH 22/66] Remove type restiction in ELBO --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 827af5f3..425cf6bf 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -28,7 +28,7 @@ end function elbo( lfx::LatentFiniteGP, - y::AbstractVector{<:Real}, + y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=1, From 92dcdf5a9bf0f4003767a5ef9196e36c2a0b4973 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 16:30:49 +0100 Subject: [PATCH 23/66] Infer batch size --- examples/classification.jl | 4 ++-- examples/regression.jl | 6 +++--- src/elbo.jl | 11 +++++------ 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 6dfd0c00..03146024 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -89,9 +89,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data=1, n_batch=1) 
+function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) + return -SparseGPs.elbo(fx, y, fu, q; n_data) end # %% diff --git a/examples/regression.jl b/examples/regression.jl index 6f5b0766..0cba8ac6 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -65,9 +65,9 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. -function flux_loss(x, y; n_data=1, n_batch=1) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) + return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -96,7 +96,7 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), + (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs opt diff --git a/src/elbo.jl b/src/elbo.jl index 425cf6bf..20e37b96 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,5 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a @@ -9,15 +9,14 @@ variational distribution over inducing points `u = f(z)`. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ - function elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; - n_data=1, - n_batch=1 + n_data=length(y) ) + n_batch = length(y) kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise @@ -31,9 +30,9 @@ function elbo( y::AbstractVector, fz::FiniteGP, q::MvNormal; - n_data=1, - n_batch=1 + n_data=length(y) ) + n_batch = length(y) kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) From ec5fa05e546fe7966a88f46fe2b14b695352c27c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 18:25:00 +0100 Subject: [PATCH 24/66] Added docstrings to elbo.jl --- src/elbo.jl | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 20e37b96..4f6783bd 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,5 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=length(y)) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a @@ -25,6 +25,12 @@ function elbo( return sum(variational_exp) * scale - kl_term end + +""" + elbo(fx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=length(y)) + +Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. +""" function elbo( lfx::LatentFiniteGP, y::AbstractVector, @@ -52,6 +58,42 @@ function _elbo_intermediates( return kl_term, f_mean, f_var end +"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." 
+ScalarLikelihood = Union{BernoulliLikelihood, CategoricalLikelihood, PoissonLikelihood} + +""" + expected_loglik(y, f_mean, f_var, [Σy | lik]) + +This function computes the expected log likelihood: + +```math + ∫ q(f) log p(y | f) df +``` +where `p(y | f)` is the process likelihood. + +`q(f)` is an approximation to the latent function values `f` given by: +```math + q(f) = ∫ p(f | u) q(u) du +``` +where `q(u)` is the variational distribution over inducing points (see +[`elbo`](@ref)). + +Where possible, this expectation is calculated in closed form. Otherwise, it is +approximated using Gauss-Hermite quadrature by default. + +# Extended help + +`q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to +have independent marginals such that only the marginals of `q(f)` are required. +""" + +function expected_loglik end + +""" + expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + +The expected log likelihood for a Gaussian likelihood, computed in closed form. +""" # The closed form expected loglikelihood for a Gaussian likelihood function expected_loglik( y::AbstractVector{<:Real}, @@ -62,11 +104,18 @@ function expected_loglik( return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end +""" + expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; n_points=20) + +The expected log likelihood for a `ScalarLikelihood`, approximated via +Gauss-Hermite quadrature with `n_points` quadrature points. +""" + function expected_loglik( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::BernoulliLikelihood; + lik::ScalarLikelihood; n_points=20 ) return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) From 6d4e87b87f5329eb00750c0c4c5c7cadf2c97328 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 20:56:06 +0100 Subject: [PATCH 25/66] Added cross-covariance --- src/svgp.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 15b0a94d..d4f0e944 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -42,7 +42,14 @@ function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end -#TODO: cov(x, y) +function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) + B = f.data.B + Cxu = cov(f.prior, x, f.data.u) + Cuy = cov(f.prior, f.data.u, y) + D = f.data.Kuu.L \ Cuy + E = Cxu / f.data.Kuu.L' + return cov(f.prior, x, y) - (E * D) + (E * B * B' * D) +end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) From 22c999ae39f11989fd556be47b5254cee0af8487 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 21:46:44 +0100 Subject: [PATCH 26/66] Removed unnecessary dependencies --- Project.toml | 2 -- src/SparseGPs.jl | 2 -- 2 files changed, 4 deletions(-) diff --git a/Project.toml b/Project.toml index 45712703..e2645e29 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,5 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -StatsFuns = 
"4c63d2b9-4356-54db-8cca-17b64c39e42c" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index e34e34c1..4bd6092b 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -2,8 +2,6 @@ module SparseGPs using AbstractGPs using Distributions -using Optim -using StatsFuns using LinearAlgebra using Statistics using StatsBase From 27639723ce0d4d4c7d9f85e6ec453e387f9a53bb Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 22:07:50 +0100 Subject: [PATCH 27/66] Updated regression example --- examples/regression.jl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/regression.jl b/examples/regression.jl index 0cba8ac6..d537f448 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -30,6 +30,9 @@ scatter(x, y; xlabel="x", ylabel="y", legend=false) # A simple Flux model using Flux +lik_noise = 0.3 +jitter = 1e-5 + struct SVGPModel k # kernel parameters m # variational mean @@ -50,8 +53,8 @@ function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) - fx = f(x, 0.3) - fu = f(m.z, 0.3) + fx = f(x, lik_noise) + fu = f(m.z, jitter) return fx, fu, q end @@ -59,7 +62,7 @@ end function posterior(m::SVGPModel) kernel = make_kernel(m.k) f = GP(kernel) - fu = f(m.z, 0.3) + fu = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end @@ -85,7 +88,7 @@ A = Matrix{Float64}(I, M, M) model = SVGPModel(k, m, A, z) b = 100 # minibatch size -opt = ADAM(0.01) +opt = ADAM(0.001) parameters = Flux.params(model) data_loader = Flux.Data.DataLoader((x, y), batchsize=b) @@ -140,10 +143,10 @@ function exact_q(fu, fx, y) return MvNormal(m, S) end -kernel = make_kernel([0.2, 11]) +kernel = make_kernel([0.3, 10]) f = GP(kernel) -fx = f(x, 0.1) -fu = f(z, 0.1) +fx = f(x, lik_noise) +fu = f(z, jitter) q_ex = exact_q(fu, fx, y) scatter(x, y) @@ -153,7 +156,7 @@ scatter!(z, q_ex.μ) ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior -# Should these be the same? 
(they currently aren't) +# These are also approximately equal SparseGPs.elbo(fx, y, fu, q_ex) AbstractGPs.elbo(fx, y, fu) @@ -161,7 +164,9 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - xlim=(0, 1), + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), xlabel="x", ylabel="y", title="posterior (VI with sparse grid)", From 23e5c2e3c6ba7174d679002d0a482853d2227778 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 14 Jul 2021 11:56:42 +0100 Subject: [PATCH 28/66] Added exact posterior tests --- test/Project.toml | 1 + test/equivalences.jl | 206 ++++++++++++++++++++++--------------------- test/runtests.jl | 1 + 3 files changed, 109 insertions(+), 99 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index a4a781f3..47a7de77 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,5 +3,6 @@ AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/equivalences.jl b/test/equivalences.jl index 46fb2ba4..e3f70fab 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,116 +4,124 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs + + make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - # Create a kernel from parameters k - kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) k_init = [0.2, 0.6] # initial kernel parameters - lik_noise = 0.1 # The (fixed) Gaussian likelihood noise - jitter = 1e-5 - - ## FIRST - define the models - # GPR - Exact GP regression - struct GPRModel - k # kernel parameters - end - @Flux.functor GPRModel - - function (m::GPRModel)(x) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - return fx - end - - # # SGPR - Sparse GP regression (Titsias 2009) - # struct SGPRModel - # k # kernel parameters - # z # inducing points - # end - # @Flux.functor SGPRModel (k,) # Don't train the inducing inputs - - # function (m::SGPRModel)(x) - # f = GP(kernel(m.k)) - # fx = f(x, lik_noise) - # fz = f(m.z, lik_noise) - # return fx, fz - # end - - # SVGP - Sparse variational GP regression (Hensman 2014) - struct SVGPModel - k # kernel parameters - z # inducing points - m # variational mean - A # variational covariance sqrt (Σ = A'A) - end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs - function (m::SVGPModel)(x) - f = GP(kernel(m.k)) - q = MvNormal(m.m, m.A'm.A) + @testset "exact posterior" begin + # There is a closed form optimal solution for the variational posterior + # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ + # equations (11) & (12)). The SVGP posterior with this optimal q(u) + # should therefore be equivalent to the sparse GP (Titsias) posterior + # and exact GP regression (when z == x). 
+ + function exact_q(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) + end + + kernel = make_kernel(k_init) + f = GP(kernel) fx = f(x, lik_noise) - fz = f(m.z, jitter) - return fx, fz, q - end - - ## SECOND - create the models and associated training losses - gpr = GPRModel(copy(k_init)) - function GPR_loss(x, y) - fx = gpr(x) - return -logpdf(fx, y) - end - - # sgpr = SGPRModel(copy(k_init), copy(z)) - # function SGPR_loss(x, y) - # fx, fz = sgpr(x) - # return -AbstractGPs.elbo(fx, y, fz) - # end - - m, A = rand(rng, N), rand(rng, N, N)/2 # initialise the variational parameters - svgp = SVGPModel(copy(k_init), copy(z), m, A) - function SVGP_loss(x, y) - fx, fz, q = svgp(x) - return -SparseGPs.elbo(fx, y, fz, q) - end - - ## THIRD - train the models - data = [(x, y)] - opt = ADAM(0.01) + fu = f(z) + q_ex = exact_q(fu, fx, y) - svgp_ps = Flux.params(svgp) - delete!(svgp_ps, svgp.k) # Don't train the kernel parameters + gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression + vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior + svgp_post = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior - # Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 3000), opt) - # Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 3000), opt) - Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 9000), opt) + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 - ## FOURTH - construct the posteriors - function posterior(m::GPRModel, x, y) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - return AbstractGPs.posterior(fx, y) + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 end - # function posterior(m::SGPRModel, x, y) - # f = GP(kernel(m.k)) - # fx = f(x, lik_noise) - # fz = f(m.z) - # return AbstractGPs.approx_posterior(VFE(), fx, y, fz) - # end - - function posterior(m::SVGPModel) - f = GP(kernel(m.k)) - fz = f(m.z, jitter) - q = MvNormal(m.m, m.A'm.A) - return SparseGPs.approx_posterior(SVGP(), fz, q) + @testset "optimised posterior" begin + jitter = 1e-5 + + ## FIRST - define the models + # GPR - Exact GP regression + struct GPRModel + k # kernel parameters + end + @Flux.functor GPRModel + + function (m::GPRModel)(x) + f = GP(make_kernel(m.k)) + fx = f(x, lik_noise) + return fx + end + + # SVGP - Sparse variational GP regression (Hensman 2014) + struct SVGPModel + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) + end + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + + function (m::SVGPModel)(x) + f = GP(make_kernel(m.k)) + q = MvNormal(m.m, m.A'm.A) + fx = f(x, lik_noise) + fz = f(m.z, jitter) + return fx, fz, q + end + + ## SECOND - create the models and associated training losses + gpr = GPRModel(copy(k_init)) + function GPR_loss(x, y) + fx = gpr(x) + return -logpdf(fx, y) + end + + m, A = zeros(N), Matrix{Float64}(I, N, N) # initialise the variational parameters + svgp = SVGPModel(copy(k_init), copy(z), m, A) + function SVGP_loss(x, y) + fx, fz, q = svgp(x) + return -SparseGPs.elbo(fx, y, fz, q) + end + + ## THIRD - train the models + data = [(x, y)] + opt = ADAM(0.001) + + svgp_ps = Flux.params(svgp) + delete!(svgp_ps, svgp.k) # Don't 
train the kernel parameters + + # Optimise q(u) + Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 20000), opt) + + ## FOURTH - construct the posteriors + function posterior(m::GPRModel, x, y) + f = GP(make_kernel(m.k)) + fx = f(x, lik_noise) + return AbstractGPs.posterior(fx, y) + end + + function posterior(m::SVGPModel) + f = GP(make_kernel(m.k)) + fz = f(m.z, jitter) + q = MvNormal(m.m, m.A'm.A) + return SparseGPs.approx_posterior(SVGP(), fz, q) + end + + gpr_post = posterior(gpr, x, y) + svgp_post = posterior(svgp) + + ## FIFTH - test equivalences + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end - gpr_post = posterior(gpr, x, y) - # sgpr_post = posterior(sgpr, x, y) - svgp_post = posterior(svgp) - - ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-3)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-3)) end diff --git a/test/runtests.jl b/test/runtests.jl index e5d8346a..5419760a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ using Flux using IterTools using AbstractGPs using Distributions +using LinearAlgebra const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) From a8e5cbe740e26e0090021cc0cc05cf67e07435e0 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 14 Jul 2021 22:39:20 +0100 Subject: [PATCH 29/66] Address review comments --- src/elbo.jl | 21 +++++++++++---------- src/svgp.jl | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 4f6783bd..ad2e2f54 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -13,7 +13,7 @@ function elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, - q::MvNormal; + q::AbstractMvNormal; n_data=length(y) ) n_batch = length(y) @@ -35,7 +35,7 @@ function elbo( lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, - q::MvNormal; + q::AbstractMvNormal; n_data=length(y) ) n_batch = length(y) @@ -43,14 +43,14 @@ function elbo( variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term + return variational_exp * scale - kl_term end # Computes the common intermediates needed for the ELBO function _elbo_intermediates( fx::FiniteGP, fz::FiniteGP, - q::MvNormal + q::AbstractMvNormal ) kl_term = kl_divergence(q, fz) post = approx_posterior(SVGP(), fz, q) @@ -59,7 +59,7 @@ function _elbo_intermediates( end "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." 
-ScalarLikelihood = Union{BernoulliLikelihood, CategoricalLikelihood, PoissonLikelihood} +ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood} """ expected_loglik(y, f_mean, f_var, [Σy | lik]) @@ -101,7 +101,7 @@ function expected_loglik( f_var::AbstractVector, Σy::AbstractVector ) - return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) + return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) end """ @@ -118,13 +118,14 @@ function expected_loglik( lik::ScalarLikelihood; n_points=20 ) - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) + return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) end -function kl_divergence(q::MvNormal, p::AbstractMvNormal) +function kl_divergence(q::AbstractMvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) + q_μ, q_Σ = mean(q), cov(q) + (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + + Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end function gauss_hermite_quadrature( diff --git a/src/svgp.jl b/src/svgp.jl index d4f0e944..f0724905 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -18,8 +18,8 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ -function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) - m, A = q.μ, cholesky(q.Σ) +function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) + m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) From 1bbeae0e18f5f18de08d8e5a2a07afc69040a744 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 16 Jul 2021 16:29:28 +0100 Subject: [PATCH 30/66] Fix docstrings Co-authored-by: st-- --- src/elbo.jl | 9 +++------ src/svgp.jl | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index ad2e2f54..c12afccc 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,8 +1,8 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=length(y)) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are -observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a +observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable @@ -27,7 +27,7 @@ end """ - elbo(fx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=length(y)) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -86,7 +86,6 @@ approximated using Gauss-Hermite quadrature by default. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. """ - function expected_loglik end """ @@ -94,7 +93,6 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form. 
""" -# The closed form expected loglikelihood for a Gaussian likelihood function expected_loglik( y::AbstractVector{<:Real}, f_mean::AbstractVector, @@ -110,7 +108,6 @@ end The expected log likelihood for a `ScalarLikelihood`, approximated via Gauss-Hermite quadrature with `n_points` quadrature points. """ - function expected_loglik( y::AbstractVector, f_mean::AbstractVector, diff --git a/src/svgp.jl b/src/svgp.jl index f0724905..e87da8bc 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,6 +1,6 @@ struct SVGP end -""" +raw""" approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) Compute the approximate posterior [1] over the process `f = fz.f`, given inducing @@ -9,7 +9,7 @@ f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` is then given by: ```math -q(f^*) = \\int p(f | u) q(u) du +q(f^*) = \int p(f | u) q(u) du ``` which can be found in closed form. From 1a0782ffbb0ead758ba80b7892fcd0094306c576 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 18 Jul 2021 20:07:59 +0100 Subject: [PATCH 31/66] Rename kldivergence --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index c12afccc..5894faeb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -52,7 +52,7 @@ function _elbo_intermediates( fz::FiniteGP, q::AbstractMvNormal ) - kl_term = kl_divergence(q, fz) + kl_term = StatsBase.kldivergence(q, fz) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) return kl_term, f_mean, f_var @@ -118,7 +118,7 @@ function expected_loglik( return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) end -function kl_divergence(q::AbstractMvNormal, p::AbstractMvNormal) +function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) q_μ, q_Σ = mean(q), cov(q) (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + From eddc7ab8ebaa675efb463d601ca21696c31018bb Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 13:18:58 +0100 Subject: [PATCH 32/66] Factor out exact posterior --- test/equivalences.jl | 12 +----------- test/test_utils.jl | 12 ++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/test/equivalences.jl b/test/equivalences.jl index e3f70fab..1765c790 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -17,21 +17,11 @@ # should therefore be equivalent to the sparse GP (Titsias) posterior # and exact GP regression (when z == x). - function exact_q(fu, fx, y) - σ² = fx.Σy[1] - Kuf = cov(fu, fx) - Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y - S = Symmetric(Kuu * (Σ \ Kuu)) - return MvNormal(m, S) - end - kernel = make_kernel(k_init) f = GP(kernel) fx = f(x, lik_noise) fu = f(z) - q_ex = exact_q(fu, fx, y) + q_ex = exact_variational_posterior(fu, fx, y) gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior diff --git a/test/test_utils.jl b/test/test_utils.jl index e69de29b..0bae973c 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -0,0 +1,12 @@ +# Computes the optimal closed form solution for the variational posterior +# q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ +# equations (11) & (12)). 
+function exact_variational_posterior(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) +end From 7ea3c2f7e2e9256ceb68047bebd3f087c0849244 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 13:19:19 +0100 Subject: [PATCH 33/66] Use AbstractGPs TestUtils --- test/svgp.jl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/svgp.jl b/test/svgp.jl index b5e84b04..a55b9bf4 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -1,4 +1,20 @@ @testset "svgp" begin - x = 4 - @test x == 4 + rng = MersenneTwister(123456) + N_cond = 5 + N_a = 6 + N_b = 7 + + # Specify prior. + f = GP(Matern32Kernel()) + # Sample from prior. + x = collect(range(-1.0, 1.0; length=N_cond)) + fx = f(x, 1e-15) + y = rand(rng, fx) + + q = exact_variational_posterior(fx, fx, y) + f_approx_post = SparseGPs.approx_posterior(SVGP(), fx, q) + + a = collect(range(-1.0, 1.0; length=N_a)) + b = randn(rng, N_b) + AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end From 9b6557fcc7b2481512a82b350fef18e97e522837 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 17:00:57 +0100 Subject: [PATCH 34/66] Added support for prior mean function --- src/svgp.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/svgp.jl b/src/svgp.jl index e87da8bc..2a5edd68 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -22,12 +22,13 @@ function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L - data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) + α=Kuu \ (m - mean(fz)) + data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - return cov(f.prior, x, f.data.u) * f.data.α + return mean(f.prior, x) + cov(f.prior, x, f.data.u) * f.data.α end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) From 0e59e49ae9d32be55c20c96336addb85304e350c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 02:52:56 +0100 Subject: [PATCH 35/66] Added MC expectation and refactored elbo --- Project.toml | 1 + examples/classification.jl | 2 +- src/SparseGPs.jl | 1 + src/elbo.jl | 121 ++++++++++++++++++++++++++----------- 4 files changed, 89 insertions(+), 36 deletions(-) diff --git a/Project.toml b/Project.toml index e2645e29..5f55a43d 100644 --- a/Project.toml +++ b/Project.toml @@ -10,5 +10,6 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/examples/classification.jl b/examples/classification.jl index 03146024..153f633c 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -91,7 +91,7 @@ end function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=:montecarlo) end # %% diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 4bd6092b..ad55bb39 100644 --- a/src/SparseGPs.jl +++ 
b/src/SparseGPs.jl @@ -7,6 +7,7 @@ using Statistics using StatsBase using FastGaussQuadrature using GPLikelihoods +using SpecialFunctions using ChainRulesCore using AbstractGPs: diff --git a/src/elbo.jl b/src/elbo.jl index 5894faeb..eab1afa1 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,3 +1,6 @@ +"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." +ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} + """ elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) @@ -14,15 +17,11 @@ function elbo( y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y) + n_data=length(y), + method=:default, + kwargs... ) - n_batch = length(y) - kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) - - Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise - variational_exp = expected_loglik(y, f_mean, f_var, Σy) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term + return _elbo(fx, y, fz, q, fx.Σy, n_data, method; kwargs...) end @@ -36,30 +35,34 @@ function elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y) + n_data=length(y), + method=:default, + kwargs... ) - n_batch = length(y) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) - - variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) - scale = n_data / n_batch - return variational_exp * scale - kl_term + return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) end -# Computes the common intermediates needed for the ELBO -function _elbo_intermediates( + +function _elbo( fx::FiniteGP, + y::AbstractVector, fz::FiniteGP, - q::AbstractMvNormal + q::AbstractMvNormal, + lik::Union{AbstractVecOrMat,ScalarLikelihood}, + n_data::Integer, + method::Symbol; + kwargs... ) - kl_term = StatsBase.kldivergence(q, fz) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) - return kl_term, f_mean, f_var -end + variational_exp = expected_loglik(y, f_mean, f_var, lik; method, kwargs...) -"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." -ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood} + kl_term = StatsBase.kldivergence(q, fz) + + n_batch = length(y) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end """ expected_loglik(y, f_mean, f_var, [Σy | lik]) @@ -89,17 +92,23 @@ have independent marginals such that only the marginals of `q(f)` are required. function expected_loglik end """ - expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) -The expected log likelihood for a Gaussian likelihood, computed in closed form. +The expected log likelihood for a Gaussian likelihood, computed in closed form by default. """ function expected_loglik( y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractVector + Σy::AbstractMatrix; + method=:default, + kwargs... ) - return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) + if method === :default + return closed_form_expectation(y, f_mean, f_var, diag(Σy)) + else + return expected_loglik(y, f_mean, f_var, GaussianLikelihood(Σy[1]); method, kwargs...) 
+ end end """ @@ -113,16 +122,48 @@ function expected_loglik( f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; - n_points=20 + method=:default, + n_points=20, + n_samples=20 ) - return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) + if method === :default && has_closed_form_expectation(lik) + return closed_form_expectation(y, f_mean, f_var, lik) + elseif method === :default || method === :gausshermite + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points) + elseif method === :montecarlo + return monte_carlo_expectation(y, f_mean, f_var, lik; n_samples) + end end -function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - q_μ, q_Σ = mean(q), cov(q) - (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + - Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) +function closed_form_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector + ) + return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) +end + +function closed_form_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + ::PoissonLikelihood + ) + return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) +end + +function monte_carlo_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::ScalarLikelihood; + n_samples=20 +) + # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var + fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), n_samples) + lls = loglikelihood.(lik.(fs), y) + return sum(lls) / n_samples end function gauss_hermite_quadrature( @@ -139,7 +180,17 @@ function gauss_hermite_quadrature( # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) - return (1/√π) * lls * ws + return sum((1/√π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) + +function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) + p_μ, p_Σ = mean(p), cov(p) + q_μ, q_Σ = mean(q), cov(q) + (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + + Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) +end + +has_closed_form_expectation(lik::Union{PoissonLikelihood,GaussianLikelihood}) = true +has_closed_form_expectation(lik) = false From 38ed15ff973beb73fd694ad5f4f1f25c8373129e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 03:08:16 +0100 Subject: [PATCH 36/66] Updated docstrings --- src/elbo.jl | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index eab1afa1..58dd9d9b 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -2,12 +2,18 @@ ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. +`method` selects which method is used to calculate the expected loglikelihood in +the ELBO. The options are: `:default`, `:gausshermite` and `:montecarlo`. For +likelihoods with a closed form solution, `:default` uses this exact solution. 
If +there is no such solution, `:default` is instead synonymous with +`:gausshermite`. + [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. @@ -26,7 +32,7 @@ end """ - elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -42,7 +48,7 @@ function elbo( return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) end - +# Compute the common elements of the ELBO function _elbo( fx::FiniteGP, y::AbstractVector, @@ -94,7 +100,10 @@ function expected_loglik end """ expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) -The expected log likelihood for a Gaussian likelihood, computed in closed form by default. +The expected log likelihood for a Gaussian likelihood, computed in closed form +by default. If using the closed form solution, the noise Σy is assumed to be +uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montecarlo`, +the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( y::AbstractVector{<:Real}, @@ -112,10 +121,11 @@ function expected_loglik( end """ - expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; n_points=20) + expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; method=:default, n_points=20, n_samples=20) -The expected log likelihood for a `ScalarLikelihood`, approximated via -Gauss-Hermite quadrature with `n_points` quadrature points. +The expected log likelihood for a `ScalarLikelihood`, computed via `method`. +Defaults to a closed form solution if it exists, otherwise defaults to +Gauss-Hermite quadrature. 
""" function expected_loglik( y::AbstractVector, @@ -135,6 +145,7 @@ function expected_loglik( end end +# The closed form solution for independent Gaussian noise function closed_form_expectation( y::AbstractVector, f_mean::AbstractVector, @@ -144,6 +155,7 @@ function closed_form_expectation( return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) end +# The closed form solution for a Poisson likelihood function closed_form_expectation( y::AbstractVector, f_mean::AbstractVector, From c8a974f5d77f379a3688de35931c16983329e92b Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 14:59:19 +0100 Subject: [PATCH 37/66] Dispatch on types instead of symbols --- examples/classification.jl | 2 +- src/SparseGPs.jl | 7 ++- src/elbo.jl | 118 +++++++++++++++++++++---------------- 3 files changed, 73 insertions(+), 54 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 153f633c..b1442a09 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -91,7 +91,7 @@ end function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method=:montecarlo) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) end # %% diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index ad55bb39..b39b7a51 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -22,7 +22,12 @@ using AbstractGPs: export elbo, approx_posterior, - SVGP + SVGP, + Default, + Analytic, + GaussHermite, + MonteCarlo + include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index 58dd9d9b..61739a69 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,6 +1,21 @@ "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} + +abstract type ExpectationMethod end +struct Default <: ExpectationMethod end +struct Analytic <: ExpectationMethod end + +struct GaussHermite <: ExpectationMethod + n_points +end +GaussHermite() = GaussHermite(20) + +struct MonteCarlo <: ExpectationMethod + n_samples +end +MonteCarlo() = MonteCarlo(20) + """ elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) @@ -24,10 +39,9 @@ function elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=:default, - kwargs... + method=Default() ) - return _elbo(fx, y, fz, q, fx.Σy, n_data, method; kwargs...) + return _elbo(method, fx, y, fz, q, fx.Σy, n_data) end @@ -42,26 +56,24 @@ function elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=:default, - kwargs... + method=Default() ) - return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) + return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end # Compute the common elements of the ELBO function _elbo( + method::ExpectationMethod, fx::FiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal, lik::Union{AbstractVecOrMat,ScalarLikelihood}, - n_data::Integer, - method::Symbol; - kwargs... + n_data::Integer ) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) - variational_exp = expected_loglik(y, f_mean, f_var, lik; method, kwargs...) + variational_exp = expected_loglik(method, y, f_mean, f_var, lik) kl_term = StatsBase.kldivergence(q, fz) @@ -106,18 +118,36 @@ uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montec the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). 
""" function expected_loglik( + ::Default, y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix; - method=:default, - kwargs... + Σy::AbstractMatrix +) + method = _default_method(GaussianLikelihood()) + expected_loglik(method, y, f_mean, f_var, Σy) +end + +# The closed form solution for independent Gaussian noise +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractMatrix ) - if method === :default - return closed_form_expectation(y, f_mean, f_var, diag(Σy)) - else - return expected_loglik(y, f_mean, f_var, GaussianLikelihood(Σy[1]); method, kwargs...) - end + Σy_diag = diag(Σy) + return sum(-0.5 * (log(2π) .+ log.(Σy_diag) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy_diag)) +end + +function expected_loglik( + method::Union{GaussHermite,MonteCarlo}, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractMatrix +) + return expected_loglik(method, y, f_mean, f_var, GaussianLikelihood(Σy[1])) end """ @@ -128,67 +158,51 @@ Defaults to a closed form solution if it exists, otherwise defaults to Gauss-Hermite quadrature. """ function expected_loglik( + ::Default, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::ScalarLikelihood; - method=:default, - n_points=20, - n_samples=20 + lik::ScalarLikelihood ) - if method === :default && has_closed_form_expectation(lik) - return closed_form_expectation(y, f_mean, f_var, lik) - elseif method === :default || method === :gausshermite - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points) - elseif method === :montecarlo - return monte_carlo_expectation(y, f_mean, f_var, lik; n_samples) - end -end - -# The closed form solution for independent Gaussian noise -function closed_form_expectation( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractVector - ) - return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) + method = _default_method(lik) + expected_loglik(method, y, f_mean, f_var, lik) end # The closed form solution for a Poisson likelihood -function closed_form_expectation( +function expected_loglik( + ::Analytic, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, ::PoissonLikelihood - ) +) return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) end -function monte_carlo_expectation( +function expected_loglik( + mc::MonteCarlo, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::ScalarLikelihood; - n_samples=20 + lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var - fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), n_samples) + fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), mc.n_samples) lls = loglikelihood.(lik.(fs), y) - return sum(lls) / n_samples + return sum(lls) / mc.n_samples end -function gauss_hermite_quadrature( +function expected_loglik( + gh::GaussHermite, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik; - n_points=20 + lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - xs, ws = gausshermite(n_points) + xs, ws = gausshermite(gh.n_points) # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) @@ -204,5 +218,5 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end -has_closed_form_expectation(lik::Union{PoissonLikelihood,GaussianLikelihood}) = true -has_closed_form_expectation(lik) = false +_default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() +_default_method(_) = GaussHermite() From 56507a8e2157f236a687d82d40a33b873b073ee2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:08:49 +0100 Subject: [PATCH 38/66] Update doctrings --- src/elbo.jl | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 61739a69..14d4b5a8 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -17,17 +17,17 @@ end MonteCarlo() = MonteCarlo(20) """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. `method` selects which method is used to calculate the expected loglikelihood in -the ELBO. The options are: `:default`, `:gausshermite` and `:montecarlo`. For -likelihoods with a closed form solution, `:default` uses this exact solution. If -there is no such solution, `:default` is instead synonymous with -`:gausshermite`. +the ELBO. The options are: `Default()`, `Analytic()`, `GaussHermite()` and +`MonteCarlo()`. For likelihoods with an analytic solution, `Default()` uses this +exact solution. If there is no such solution, `Default()` either uses +`GaussHermite()` or `MonteCarlo()`, depending on the likelihood. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -46,7 +46,7 @@ end """ - elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -83,7 +83,7 @@ function _elbo( end """ - expected_loglik(y, f_mean, f_var, [Σy | lik]) + expected_loglik(method, y, f_mean, f_var, [Σy | lik]) This function computes the expected log likelihood: @@ -100,7 +100,7 @@ where `q(u)` is the variational distribution over inducing points (see [`elbo`](@ref)). Where possible, this expectation is calculated in closed form. Otherwise, it is -approximated using Gauss-Hermite quadrature by default. +approximated using either Gauss-Hermite quadrature or Monte Carlo. # Extended help @@ -114,7 +114,7 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montecarlo`, +uncorrelated (i.e. only diag(Σy) is used). 
If using `GaussHermite()` or `MonteCarlo()`, the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( @@ -151,7 +151,7 @@ function expected_loglik( end """ - expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; method=:default, n_points=20, n_samples=20) + expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) The expected log likelihood for a `ScalarLikelihood`, computed via `method`. Defaults to a closed form solution if it exists, otherwise defaults to @@ -179,6 +179,16 @@ function expected_loglik( return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) end +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik +) + return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) +end + function expected_loglik( mc::MonteCarlo, y::AbstractVector, From 857ecc34f4c718a063cb83c7caff61903acbffef Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:09:16 +0100 Subject: [PATCH 39/66] Enforce type for MonteCarlo and GaussHermite --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 14d4b5a8..95f15900 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -7,12 +7,12 @@ struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end struct GaussHermite <: ExpectationMethod - n_points + n_points::Int end GaussHermite() = GaussHermite(20) struct MonteCarlo <: ExpectationMethod - n_samples + n_samples::Int end MonteCarlo() = MonteCarlo(20) From 1bbf38568b8e3ed935786a093e64e066c7b88231 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:17:04 +0100 Subject: [PATCH 40/66] Added error for Analytic --- src/elbo.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 95f15900..cabe300e 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -186,7 +186,10 @@ function expected_loglik( f_var::AbstractVector, lik ) - return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) + return error( + "No analytic solution exists for ", lik, + ". Use `Default()`, `GaussHermite()` or `MonteCarlo()` instead." + ) end function expected_loglik( From bbd8502c66dfabf7000066df56de5d0b14e08fb3 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:26:31 +0100 Subject: [PATCH 41/66] Rename GaussHermite to Quadrature --- src/elbo.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index cabe300e..d2d3533b 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -6,10 +6,10 @@ abstract type ExpectationMethod end struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end -struct GaussHermite <: ExpectationMethod +struct Quadrature <: ExpectationMethod n_points::Int end -GaussHermite() = GaussHermite(20) +Quadrature() = Quadrature(20) struct MonteCarlo <: ExpectationMethod n_samples::Int @@ -24,10 +24,10 @@ observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. `method` selects which method is used to calculate the expected loglikelihood in -the ELBO. The options are: `Default()`, `Analytic()`, `GaussHermite()` and +the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and `MonteCarlo()`. 
For likelihoods with an analytic solution, `Default()` uses this exact solution. If there is no such solution, `Default()` either uses -`GaussHermite()` or `MonteCarlo()`, depending on the likelihood. +`Quadrature()` or `MonteCarlo()`, depending on the likelihood. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -114,7 +114,7 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `GaussHermite()` or `MonteCarlo()`, +uncorrelated (i.e. only diag(Σy) is used). If using `Quadrature()` or `MonteCarlo()`, the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( @@ -141,7 +141,7 @@ function expected_loglik( end function expected_loglik( - method::Union{GaussHermite,MonteCarlo}, + method::Union{Quadrature,MonteCarlo}, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, @@ -188,7 +188,7 @@ function expected_loglik( ) return error( "No analytic solution exists for ", lik, - ". Use `Default()`, `GaussHermite()` or `MonteCarlo()` instead." + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." ) end @@ -206,7 +206,7 @@ function expected_loglik( end function expected_loglik( - gh::GaussHermite, + gh::Quadrature, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, @@ -232,4 +232,4 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) end _default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() -_default_method(_) = GaussHermite() +_default_method(_) = Quadrature() From 0563d0192f830889f40e9feda51c2274008aee8d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 18:00:57 +0100 Subject: [PATCH 42/66] Assume homoscedastic Gaussian noise --- src/SparseGPs.jl | 2 +- src/elbo.jl | 64 ++++++++++++++---------------------------------- 2 files changed, 20 insertions(+), 46 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index b39b7a51..6209acdb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -25,7 +25,7 @@ export elbo, SVGP, Default, Analytic, - GaussHermite, + Quadrature, MonteCarlo diff --git a/src/elbo.jl b/src/elbo.jl index d2d3533b..a6ea2ba4 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -29,6 +29,9 @@ the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and exact solution. If there is no such solution, `Default()` either uses `Quadrature()` or `MonteCarlo()`, depending on the likelihood. +N.B. the observation noise `fx.Σy` is assumed to be homoscedastic and +uncorrelated - i.e. only `fx.Σy[1]` is used. + [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
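As a quick illustration of the `method` keyword documented above, a minimal usage sketch; the data, kernel and variational distribution below are placeholders, and the qualified call `SparseGPs.elbo` follows the spelling used in the examples elsewhere in this series:

    using AbstractGPs, SparseGPs, Distributions, LinearAlgebra

    f = GP(Matern32Kernel())
    x = rand(50); y = sin.(x); z = x[1:10]
    fx, fz = f(x, 0.1), f(z, 1e-6)                 # homoscedastic noise; jitter on the inducing points
    q = MvNormal(zeros(10), Matrix(0.1I, 10, 10))  # an arbitrary (non-optimised) q(u)

    SparseGPs.elbo(fx, y, fz, q)                          # Default(): analytic Gaussian expectation
    SparseGPs.elbo(fx, y, fz, q; method=Quadrature(50))   # Gauss-Hermite with 50 nodes
    SparseGPs.elbo(fx, y, fz, q; method=MonteCarlo(1000)) # 1000 reparameterised samples

All three calls estimate the same quantity; for the Gaussian likelihood the quadrature and Monte Carlo variants only serve as sanity checks against the analytic result.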
@@ -41,7 +44,7 @@ function elbo( n_data=length(y), method=Default() ) - return _elbo(method, fx, y, fz, q, fx.Σy, n_data) + return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end @@ -68,7 +71,7 @@ function _elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal, - lik::Union{AbstractVecOrMat,ScalarLikelihood}, + lik::ScalarLikelihood, n_data::Integer ) post = approx_posterior(SVGP(), fz, q) @@ -83,7 +86,7 @@ function _elbo( end """ - expected_loglik(method, y, f_mean, f_var, [Σy | lik]) + expected_loglik(method, y, f_mean, f_var, lik) This function computes the expected log likelihood: @@ -97,7 +100,8 @@ where `p(y | f)` is the process likelihood. q(f) = ∫ p(f | u) q(u) du ``` where `q(u)` is the variational distribution over inducing points (see -[`elbo`](@ref)). +[`elbo`](@ref)). The marginal means and variances of `q(f)` are given by +`f_mean` and `f_var` respectively. Where possible, this expectation is calculated in closed form. Otherwise, it is approximated using either Gauss-Hermite quadrature or Monte Carlo. @@ -110,22 +114,21 @@ have independent marginals such that only the marginals of `q(f)` are required. function expected_loglik end """ - expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) + expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) -The expected log likelihood for a Gaussian likelihood, computed in closed form -by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `Quadrature()` or `MonteCarlo()`, -the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). +The expected log likelihood for a `ScalarLikelihood`, computed via `method`. +Defaults to a closed form solution if it exists, otherwise defaults to +Gauss-Hermite quadrature. """ function expected_loglik( ::Default, - y::AbstractVector{<:Real}, + y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix + lik::ScalarLikelihood ) - method = _default_method(GaussianLikelihood()) - expected_loglik(method, y, f_mean, f_var, Σy) + method = _default_method(lik) + expected_loglik(method, y, f_mean, f_var, lik) end # The closed form solution for independent Gaussian noise @@ -134,38 +137,9 @@ function expected_loglik( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix -) - Σy_diag = diag(Σy) - return sum(-0.5 * (log(2π) .+ log.(Σy_diag) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy_diag)) -end - -function expected_loglik( - method::Union{Quadrature,MonteCarlo}, - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractMatrix + lik::GaussianLikelihood ) - return expected_loglik(method, y, f_mean, f_var, GaussianLikelihood(Σy[1])) -end - -""" - expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) - -The expected log likelihood for a `ScalarLikelihood`, computed via `method`. -Defaults to a closed form solution if it exists, otherwise defaults to -Gauss-Hermite quadrature. 
-""" -function expected_loglik( - ::Default, - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik::ScalarLikelihood -) - method = _default_method(lik) - expected_loglik(method, y, f_mean, f_var, lik) + return sum(-0.5 * (log(2π) .+ log(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) end # The closed form solution for a Poisson likelihood @@ -176,7 +150,7 @@ function expected_loglik( f_var::AbstractVector, ::PoissonLikelihood ) - return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) + return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end function expected_loglik( From fb9a56399e1710578928ecc20d52751f8fe57992 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 18:01:16 +0100 Subject: [PATCH 43/66] Add tests for `expected_loglik` --- test/elbo.jl | 18 ++++++++++++++++++ test/runtests.jl | 4 ++++ 2 files changed, 22 insertions(+) create mode 100644 test/elbo.jl diff --git a/test/elbo.jl b/test/elbo.jl new file mode 100644 index 00000000..649f985b --- /dev/null +++ b/test/elbo.jl @@ -0,0 +1,18 @@ +@testset "elbo" begin + # Test that the various methods of computing expectations return the same + # result. + rng = MersenneTwister(123456) + f_mean = rand(rng, 10) + f_var = rand(rng, 10) + + @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) + l = lik() + methods = [Quadrature(100), MonteCarlo(1000000)] + def = SparseGPs._default_method(l) + if def isa Analytic push!(methods, def) end + y = rand.(rng, l.(f_mean)) + + results = map(m -> SparseGPs.expected_loglik(m, y, f_mean, f_var, l), methods) + @test all(x->isapprox(x, results[end], rtol=1e-3), results) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 5419760a..3a60e76b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,10 @@ include("test_utils.jl") println(" ") @info "Ran svgp tests" + include("elbo.jl") + println(" ") + @info "Ran elbo tests" + include("equivalences.jl") println(" ") @info "Ran equivalences tests" From e62fbf70e2c588bc797a0645c96ad7be27a00003 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 15:44:34 +0100 Subject: [PATCH 44/66] Require ExpLink for Poisson closed form --- src/elbo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index a6ea2ba4..e96a38c7 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -139,16 +139,16 @@ function expected_loglik( f_var::AbstractVector, lik::GaussianLikelihood ) - return sum(-0.5 * (log(2π) .+ log(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) end -# The closed form solution for a Poisson likelihood +# The closed form solution for a Poisson likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - ::PoissonLikelihood + ::PoissonLikelihood{ExpLink} ) return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end From 36c62b941812b48b34cc98ddb4fac43012474172 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 15:49:15 +0100 Subject: [PATCH 45/66] Better error message --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index e96a38c7..167946ac 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -161,7 +161,7 @@ function expected_loglik( lik ) return error( - "No analytic solution exists for ", lik, + "No analytic solution exists for 
", typeof(lik), ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." ) end From 0ee10044df91dee8496cd3efcabe83bed0855467 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:53:29 +0100 Subject: [PATCH 46/66] Added close form for Gamma and Exponential --- src/elbo.jl | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 167946ac..33c62914 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,11 @@ "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." -ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} +ScalarLikelihood = Union{ + BernoulliLikelihood, + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} abstract type ExpectationMethod end @@ -153,6 +159,29 @@ function expected_loglik( return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end +# The closed form solution for an Exponential likelihood with an exponential inverse link function +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + ::ExponentialLikelihood{ExpLink} +) + return sum(-f_mean - y .* exp.((f_var / 2) .- f_mean)) +end + +# The closed form solution for a Gamma likelihood with an exponential inverse link function +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::GammaLikelihood{<:Any, ExpLink} +) + return sum((lik.α - 1) * log.(y) .- y .* exp.((f_var / 2) .- f_mean) + .- lik.α * f_mean .- loggamma(lik.α)) +end + function expected_loglik( ::Analytic, y::AbstractVector, @@ -205,5 +234,11 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end -_default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() +AnalyticLikelihood = Union{ + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} +_default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() From f648a7c6f11191d22c63dbdff31101d8a19f3450 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:56:04 +0100 Subject: [PATCH 47/66] Fix docstring --- src/svgp.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 2a5edd68..c4257eef 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -17,7 +17,6 @@ which can be found in closed form. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ - function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) From a9b9a57dab947ae20711de6f9e8ef2af1e6fc6f1 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:59:33 +0100 Subject: [PATCH 48/66] Update docstring --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 33c62914..a1e2db65 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -109,8 +109,8 @@ where `q(u)` is the variational distribution over inducing points (see [`elbo`](@ref)). The marginal means and variances of `q(f)` are given by `f_mean` and `f_var` respectively. -Where possible, this expectation is calculated in closed form. Otherwise, it is -approximated using either Gauss-Hermite quadrature or Monte Carlo. 
+`method` determines which method is used to calculate the expected log +likelihood - see [`elbo`](@ref) for more details. # Extended help From b8e7d6b71a3659df52ef28270b02b2e1336a9605 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 16:56:00 +0100 Subject: [PATCH 49/66] Fix docstring --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index a1e2db65..92f87917 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,4 +1,4 @@ -"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." +"Likelihoods which take a scalar as input and return a scalar." ScalarLikelihood = Union{ BernoulliLikelihood, PoissonLikelihood, From 9353e44d5511af90362de4cf8c576eb0ea9ca6ad Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 17:23:36 +0100 Subject: [PATCH 50/66] Restrict types for continuous distributions --- src/elbo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 92f87917..e21d2bf4 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -140,7 +140,7 @@ end # The closed form solution for independent Gaussian noise function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, lik::GaussianLikelihood @@ -162,7 +162,7 @@ end # The closed form solution for an Exponential likelihood with an exponential inverse link function function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, ::ExponentialLikelihood{ExpLink} @@ -173,7 +173,7 @@ end # The closed form solution for a Gamma likelihood with an exponential inverse link function function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, lik::GammaLikelihood{<:Any, ExpLink} From ea3d3c68c476b0c4a9e641c02d4a351b53ae0d1d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 18:31:30 +0100 Subject: [PATCH 51/66] Use `AbstractGPs.approx_posterior` and `elbo` --- src/SparseGPs.jl | 4 +--- src/elbo.jl | 4 ++-- src/svgp.jl | 2 +- test/equivalences.jl | 10 +++++----- test/svgp.jl | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 6209acdb..7c96a8bd 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -20,9 +20,7 @@ using AbstractGPs: diag_At_A, Xt_invA_X -export elbo, - approx_posterior, - SVGP, +export SVGP, Default, Analytic, Quadrature, diff --git a/src/elbo.jl b/src/elbo.jl index e21d2bf4..cefd09ae 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -42,7 +42,7 @@ uncorrelated - i.e. only `fx.Σy[1]` is used. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ -function elbo( +function AbstractGPs.elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, @@ -59,7 +59,7 @@ end Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ -function elbo( +function AbstractGPs.elbo( lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, diff --git a/src/svgp.jl b/src/svgp.jl index c4257eef..c53f5054 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -17,7 +17,7 @@ which can be found in closed form. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" -function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) +function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L diff --git a/test/equivalences.jl b/test/equivalences.jl index 1765c790..d03ee474 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -23,9 +23,9 @@ fu = f(z) q_ex = exact_variational_posterior(fu, fx, y) - gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression - vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior - svgp_post = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior + gpr_post = posterior(fx, y) # Exact GP regression + vfe_post = approx_posterior(VFE(), fx, y, fu) # Titsias posterior + svgp_post = approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 @@ -78,7 +78,7 @@ svgp = SVGPModel(copy(k_init), copy(z), m, A) function SVGP_loss(x, y) fx, fz, q = svgp(x) - return -SparseGPs.elbo(fx, y, fz, q) + return -elbo(fx, y, fz, q) end ## THIRD - train the models @@ -102,7 +102,7 @@ f = GP(make_kernel(m.k)) fz = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) - return SparseGPs.approx_posterior(SVGP(), fz, q) + return approx_posterior(SVGP(), fz, q) end gpr_post = posterior(gpr, x, y) diff --git a/test/svgp.jl b/test/svgp.jl index a55b9bf4..7dd90692 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -12,7 +12,7 @@ y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) - f_approx_post = SparseGPs.approx_posterior(SVGP(), fx, q) + f_approx_post = approx_posterior(SVGP(), fx, q) a = collect(range(-1.0, 1.0; length=N_a)) b = randn(rng, N_b) From c1a45464d75f73ea9e8e53c533e3d4168008bbf2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 21:09:51 +0100 Subject: [PATCH 52/66] Minor formatting --- src/elbo.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index cefd09ae..3490ec92 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -53,7 +53,6 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end - """ elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) @@ -117,7 +116,7 @@ likelihood - see [`elbo`](@ref) for more details. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. 
""" -function expected_loglik end +expected_loglik(method, y, f_mean, f_var, lik) """ expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) From 835da224cfb0b6a5bbc8c0d43e2f99162c3f4c16 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:09:09 +0100 Subject: [PATCH 53/66] Dispatch on filled diagonal matrix obs noise --- Project.toml | 1 + src/SparseGPs.jl | 2 ++ src/elbo.jl | 6 +++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5f55a43d..0bb253b1 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 7c96a8bd..14fef8b4 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -9,8 +9,10 @@ using FastGaussQuadrature using GPLikelihoods using SpecialFunctions using ChainRulesCore +using FillArrays using AbstractGPs: + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, diff --git a/src/elbo.jl b/src/elbo.jl index 3490ec92..1cb2bf40 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -43,7 +43,7 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ function AbstractGPs.elbo( - fx::FiniteGP, + fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; @@ -53,6 +53,10 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) where T<:FiniteGP + return error("The observation noise fx.Σy may not be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +end + """ elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) From fa1cdc36bbd9b340b069a37d568645ee3534c04e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:39:12 +0100 Subject: [PATCH 54/66] Add elbo tests --- test/elbo.jl | 21 +++++++++++++++++++-- test/equivalences.jl | 12 ++++++------ test/test_utils.jl | 3 +++ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/test/elbo.jl b/test/elbo.jl index 649f985b..c4c879c2 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -1,9 +1,26 @@ @testset "elbo" begin + rng, N = MersenneTwister(654321), 20 + x = rand(rng, N) * 10 + y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) + z = x[begin:5] + + kernel = make_kernel([0.2, 0.6]) + f = GP(sin, kernel) + fx = f(x, 0.1) + fx_bad = f(x, fill(0.1, N)) + fz = f(z) + q_ex = exact_variational_posterior(fz, fx, y) + + @test elbo(fx, y, fz, q_ex) isa Real + @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y) + + @test_throws ErrorException elbo(fx_bad, y, fz, q_ex) + # Test that the various methods of computing expectations return the same # result. 
rng = MersenneTwister(123456) - f_mean = rand(rng, 10) - f_var = rand(rng, 10) + f_mean = zeros(10) + f_var = ones(10) @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) l = lik() diff --git a/test/equivalences.jl b/test/equivalences.jl index d03ee474..1d3eba31 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -5,8 +5,6 @@ z = copy(x) # Set inducing inputs == training inputs - make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -20,18 +18,20 @@ kernel = make_kernel(k_init) f = GP(kernel) fx = f(x, lik_noise) - fu = f(z) - q_ex = exact_variational_posterior(fu, fx, y) + fz = f(z) + q_ex = exact_variational_posterior(fz, fx, y) gpr_post = posterior(fx, y) # Exact GP regression - vfe_post = approx_posterior(VFE(), fx, y, fu) # Titsias posterior - svgp_post = approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior + vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior + svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + + @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @testset "optimised posterior" begin diff --git a/test/test_utils.jl b/test/test_utils.jl index 0bae973c..33ed214f 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -1,3 +1,6 @@ +# Create a default kernel from two parameters k[1] and k[2] +make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + # Computes the optimal closed form solution for the variational posterior # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). 
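The equivalence that the new tests above encode can be sketched end-to-end as follows. This is only an illustrative sketch of what `test/equivalences.jl` and `test/elbo.jl` verify, assuming the test environment of this PR (AbstractGPs, SparseGPs, Distributions, Flux for `softplus`, KernelFunctions) and that `test/test_utils.jl` has been included so `make_kernel` and `exact_variational_posterior` are in scope. With inducing inputs equal to the training inputs and the closed-form optimal `q(u)`, the SVGP posterior should coincide with exact GP regression, and the ELBO should be tight against `logpdf(fx, y)`.

    # Sketch only — assumes make_kernel and exact_variational_posterior from
    # test/test_utils.jl are in scope, as in the test suite of this PR.
    using Random, Statistics

    rng = MersenneTwister(654321)
    N = 20
    x = rand(rng, N) * 10
    y = sin.(x) + 0.4 * rand(rng, N)

    kernel = make_kernel([0.2, 0.6])            # helper defined in test/test_utils.jl
    f = GP(kernel)
    fx = f(x, 0.1)                              # homoscedastic Gaussian observation noise
    fz = f(copy(x))                             # inducing inputs == training inputs

    q = exact_variational_posterior(fz, fx, y)  # closed-form optimal q(u)
    svgp_post = approx_posterior(SVGP(), fz, q) # Hensman (2013) posterior
    gpr_post = posterior(fx, y)                 # exact GP regression posterior

    isapprox(mean(gpr_post, x), mean(svgp_post, x); atol=1e-10)  # expected: true
    elbo(fx, y, fz, q) ≈ logpdf(fx, y)                           # ELBO is tight at the optimum

The last line is the new assertion added to `test/equivalences.jl` in this patch: when `q(u)` is the exact variational posterior and z == x, the ELBO equals the exact log marginal likelihood up to numerical error.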
From af41ca3a0384e0300aa1720569902e8a9b27708b Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:57:45 +0100 Subject: [PATCH 55/66] Small test changes --- test/elbo.jl | 2 +- test/equivalences.jl | 14 ++++++++------ test/test_utils.jl | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/test/elbo.jl b/test/elbo.jl index c4c879c2..2d33b53a 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -7,13 +7,13 @@ kernel = make_kernel([0.2, 0.6]) f = GP(sin, kernel) fx = f(x, 0.1) - fx_bad = f(x, fill(0.1, N)) fz = f(z) q_ex = exact_variational_posterior(fz, fx, y) @test elbo(fx, y, fz, q_ex) isa Real @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y) + fx_bad = f(x, fill(0.1, N)) @test_throws ErrorException elbo(fx_bad, y, fz, q_ex) # Test that the various methods of computing expectations return the same diff --git a/test/equivalences.jl b/test/equivalences.jl index 1d3eba31..162e8d3d 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -36,7 +36,9 @@ @testset "optimised posterior" begin jitter = 1e-5 - + + make_gp(kernel) = GP(kernel) + ## FIRST - define the models # GPR - Exact GP regression struct GPRModel @@ -45,7 +47,7 @@ @Flux.functor GPRModel function (m::GPRModel)(x) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fx = f(x, lik_noise) return fx end @@ -60,7 +62,7 @@ @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function (m::SVGPModel)(x) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) q = MvNormal(m.m, m.A'm.A) fx = f(x, lik_noise) fz = f(m.z, jitter) @@ -93,18 +95,18 @@ ## FOURTH - construct the posteriors function posterior(m::GPRModel, x, y) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fx = f(x, lik_noise) return AbstractGPs.posterior(fx, y) end function posterior(m::SVGPModel) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fz = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) return approx_posterior(SVGP(), fz, q) end - + gpr_post = posterior(gpr, x, y) svgp_post = posterior(svgp) diff --git a/test/test_utils.jl b/test/test_utils.jl index 33ed214f..0ac02c41 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -3,7 +3,7 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # Computes the optimal closed form solution for the variational posterior # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ -# equations (11) & (12)). +# equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) From de2c4cd42fc0cae5079ff91501a329bc5f7de7e2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 23:07:23 +0100 Subject: [PATCH 56/66] Fix elbo error --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 1cb2bf40..4a90329d 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -53,8 +53,8 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) where T<:FiniteGP - return error("The observation noise fx.Σy may not be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) 
+ return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") end """ From f07c6f17a4722234546acdd1ddff0de0e11a86a8 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 13:55:34 +0100 Subject: [PATCH 57/66] Remove qualifier from kldivergence Co-authored-by: st-- --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 4a90329d..17ebf802 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -87,7 +87,7 @@ function _elbo( f_mean, f_var = mean_and_var(post, fx.x) variational_exp = expected_loglik(method, y, f_mean, f_var, lik) - kl_term = StatsBase.kldivergence(q, fz) + kl_term = kldivergence(q, fz) n_batch = length(y) scale = n_data / n_batch From 9f4d2951c1663f04657e603663c9b5220fd5b142 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:00:12 +0100 Subject: [PATCH 58/66] Check for ZeroMean --- test/elbo.jl | 2 +- test/test_utils.jl | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/elbo.jl b/test/elbo.jl index 2d33b53a..7de90d9a 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -5,7 +5,7 @@ z = x[begin:5] kernel = make_kernel([0.2, 0.6]) - f = GP(sin, kernel) + f = GP(kernel) fx = f(x, 0.1) fz = f(z) q_ex = exact_variational_posterior(fz, fx, y) diff --git a/test/test_utils.jl b/test/test_utils.jl index 0ac02c41..02b50670 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,6 +5,7 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) + fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) From ca5f1488b4f374ed5e005c8a66c54e618509df21 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:20:32 +0100 Subject: [PATCH 59/66] Fix classification example jitter --- examples/classification.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index b1442a09..ab476bce 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -51,6 +51,7 @@ plt = plot( ) scatter!(plt, x, y; seriescolor="blue", label="Data points") + # %% # Plot the same samples, but pushed through a logistic sigmoid to constrain # them in (0, 1). @@ -80,9 +81,11 @@ end @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() +jitter = 1e-4 + function (m::SVGPModel)(x) kernel = make_kernel(m.k) - f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) + f = LatentGP(GP(kernel), lik, jitter) q = MvNormal(m.m, m.A'm.A) fx = f(x) fu = f(m.z).fx @@ -117,7 +120,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y), parameters, - ncycle([(x, y)], 1000), # Train for 1000 epochs + ncycle([(x, y)], 2000), # Train for 1000 epochs opt ) @@ -127,10 +130,9 @@ println(flux_loss(x, y)) # %% # After optimisation, plot samples from the underlying posterior GP. 
- fu = f(z).fx # want the underlying FiniteGP post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) -l_post = LatentGP(post, BernoulliLikelihood(), 0.1) +l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) From 66ec256e8df64bb13447fc052b12ff7c7421dfde Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:30:00 +0100 Subject: [PATCH 60/66] Remove unnecessary imports from AbstractGPs --- src/SparseGPs.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 14fef8b4..cc0d156b 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -16,8 +16,6 @@ using AbstractGPs: FiniteGP, LatentFiniteGP, ApproxPosteriorGP, - _cholesky, - _symmetric, At_A, diag_At_A, Xt_invA_X From 6841074f3aaf0a09a917250732739df4b30035a6 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 15:10:31 +0100 Subject: [PATCH 61/66] Better cholesky of covariance methods --- src/SparseGPs.jl | 1 - src/svgp.jl | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index cc0d156b..2c2c380f 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -26,7 +26,6 @@ export SVGP, Quadrature, MonteCarlo - include("elbo.jl") include("svgp.jl") diff --git a/src/svgp.jl b/src/svgp.jl index c53f5054..12b9e293 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -18,8 +18,8 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) - m, A = mean(q), cholesky(cov(q)) - Kuu = cholesky(Symmetric(cov(fz))) + m, A = mean(q), _chol_cov(q) + Kuu = _chol_cov(fz) B = Kuu.L \ A.L α=Kuu \ (m - mean(fz)) data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) @@ -66,3 +66,6 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end + +_chol_cov(q::AbstractMvNormal) = cholesky(Symmetric(cov(q))) +_chol_cov(q::MvNormal) = cholesky(q.Σ) From 1594ee8ff9490c8c568412b3da7a540e5c8345ab Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 15:16:24 +0100 Subject: [PATCH 62/66] Use KLDivergences --- Project.toml | 1 + src/SparseGPs.jl | 1 + src/elbo.jl | 7 ------- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 0bb253b1..de63578f 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" +KLDivergences = "3c9cd921-3d3f-41e2-830c-e020174918cc" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 2c2c380f..575ed4cb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -10,6 +10,7 @@ using GPLikelihoods using SpecialFunctions using ChainRulesCore using FillArrays +using KLDivergences using AbstractGPs: AbstractGP, diff --git a/src/elbo.jl b/src/elbo.jl index 17ebf802..7c1e74fb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -230,13 +230,6 @@ end ChainRulesCore.@non_differentiable gausshermite(n) -function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - q_μ, q_Σ = mean(q), 
cov(q) - (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + - Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) -end - AnalyticLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, From 878b2145d1e8346b4ada39f7a0ed0babb6f9791e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 16:01:54 +0100 Subject: [PATCH 63/66] Use vector of marginals `q_f` vs. `f_mean, f_var` --- src/elbo.jl | 69 ++++++++++++++++++++++++---------------------------- test/elbo.jl | 7 +++--- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 7c1e74fb..e25e43da 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -35,8 +35,8 @@ the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and exact solution. If there is no such solution, `Default()` either uses `Quadrature()` or `MonteCarlo()`, depending on the likelihood. -N.B. the observation noise `fx.Σy` is assumed to be homoscedastic and -uncorrelated - i.e. only `fx.Σy[1]` is used. +N.B. the likelihood is assumed to be Gaussian with observation noise `fx.Σy`. +Further, `fx.Σy` must be homoscedastic and uncorrelated - i.e. `fx.Σy = α * I`. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -84,8 +84,8 @@ function _elbo( n_data::Integer ) post = approx_posterior(SVGP(), fz, q) - f_mean, f_var = mean_and_var(post, fx.x) - variational_exp = expected_loglik(method, y, f_mean, f_var, lik) + q_f = marginals(post(fx.x)) + variational_exp = expected_loglik(method, y, q_f, lik) kl_term = kldivergence(q, fz) @@ -95,7 +95,7 @@ function _elbo( end """ - expected_loglik(method, y, f_mean, f_var, lik) + expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) This function computes the expected log likelihood: @@ -109,8 +109,7 @@ where `p(y | f)` is the process likelihood. q(f) = ∫ p(f | u) q(u) du ``` where `q(u)` is the variational distribution over inducing points (see -[`elbo`](@ref)). The marginal means and variances of `q(f)` are given by -`f_mean` and `f_var` respectively. +[`elbo`](@ref)). The marginal distributions of `q(f)` are given by `q_f`. `method` determines which method is used to calculate the expected log likelihood - see [`elbo`](@ref) for more details. @@ -120,10 +119,10 @@ likelihood - see [`elbo`](@ref) for more details. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. """ -expected_loglik(method, y, f_mean, f_var, lik) +expected_loglik(method, y, q_f, lik) """ - expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) + expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood) The expected log likelihood for a `ScalarLikelihood`, computed via `method`. Defaults to a closed form solution if it exists, otherwise defaults to @@ -132,64 +131,61 @@ Gauss-Hermite quadrature. 
function expected_loglik( ::Default, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) method = _default_method(lik) - expected_loglik(method, y, f_mean, f_var, lik) + expected_loglik(method, y, q_f, lik) end # The closed form solution for independent Gaussian noise function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::GaussianLikelihood ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) end # The closed form solution for a Poisson likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, ::PoissonLikelihood{ExpLink} -) - return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) + ) + f_μ = mean.(q_f) + return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end # The closed form solution for an Exponential likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, ::ExponentialLikelihood{ExpLink} -) - return sum(-f_mean - y .* exp.((f_var / 2) .- f_mean)) + ) + f_μ = mean.(q_f) + return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end # The closed form solution for a Gamma likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::GammaLikelihood{<:Any, ExpLink} -) - return sum((lik.α - 1) * log.(y) .- y .* exp.((f_var / 2) .- f_mean) - .- lik.α * f_mean .- loggamma(lik.α)) + ) + f_μ = mean.(q_f) + return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) + .- lik.α * f_μ .- loggamma(lik.α)) end function expected_loglik( ::Analytic, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik ) return error( @@ -201,12 +197,12 @@ end function expected_loglik( mc::MonteCarlo, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) - # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var - fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), mc.n_samples) + # take 'n_samples' reparameterised samples + f_μ = mean.(q_f) + fs = f_μ .+ std.(q_f) .* randn(eltype(f_μ), length(q_f), mc.n_samples) lls = loglikelihood.(lik.(fs), y) return sum(lls) / mc.n_samples end @@ -214,8 +210,7 @@ end function expected_loglik( gh::Quadrature, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature @@ -223,7 +218,7 @@ function expected_loglik( # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(gh.n_points) # size(fs): (length(y), n_points) - fs = √2 * .√f_var .* transpose(xs) .+ f_mean + fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) return sum((1/√π) * lls * ws) end diff --git a/test/elbo.jl b/test/elbo.jl index 7de90d9a..07b89035 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -19,17 +19,16 @@ # Test that the various methods of computing expectations return the same # result. rng = MersenneTwister(123456) - f_mean = zeros(10) - f_var = ones(10) + q_f = Normal.(zeros(10), ones(10)) @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) if def isa Analytic push!(methods, def) end - y = rand.(rng, l.(f_mean)) + y = rand.(rng, l.(zeros(10))) - results = map(m -> SparseGPs.expected_loglik(m, y, f_mean, f_var, l), methods) + results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) @test all(x->isapprox(x, results[end], rtol=1e-3), results) end end From be967226317e5f9aa7e23474b9b86c9434d70d6a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:25:09 +0100 Subject: [PATCH 64/66] Ran JuliaFormatter --- examples/classification.jl | 58 ++++++++++------------------ examples/regression.jl | 65 ++++++++++++++++--------------- src/SparseGPs.jl | 14 +------ src/elbo.jl | 78 ++++++++++++++++++++------------------ src/svgp.jl | 12 +++--- test/elbo.jl | 6 ++- test/equivalences.jl | 29 +++++++------- test/svgp.jl | 4 +- test/test_utils.jl | 7 ++-- 9 files changed, 125 insertions(+), 148 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index ab476bce..c97dc556 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,7 @@ using DelimitedFiles using IterTools using Plots -default(; legend=:outertopright, size=(700, 400)) +default(; legend = :outertopright, size = (700, 400)) using Random Random.seed!(1234) @@ -40,16 +40,10 @@ fx = f(x) # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6),20) +prior_f_samples = rand(f.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - prior_f_samples; - seriescolor="red", - linealpha=0.2, - label="" -) -scatter!(plt, x, y; seriescolor="blue", label="Data points") +plt = plot(x_plot, prior_f_samples; seriescolor = "red", linealpha = 0.2, label = "") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # %% @@ -57,14 +51,8 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # them in (0, 1). 
prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot( - x_plot, - prior_y_samples; - seriescolor="red", - linealpha=0.2, - label="" -) -scatter!(plt, x, y; seriescolor="blue", label="Data points") +plt = plot(x_plot, prior_y_samples; seriescolor = "red", linealpha = 0.2, label = "") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # %% @@ -72,13 +60,13 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") using Flux struct SVGPModel - k # kernel parameters - m # variational mean - A # variational covariance - z # inducing points + k::Any # kernel parameters + m::Any # variational mean + A::Any # variational covariance + z::Any # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -92,9 +80,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data=length(y)) +function flux_loss(x, y; n_data = length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method = MonteCarlo()) end # %% @@ -121,7 +109,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt + opt, ) # %% @@ -136,13 +124,7 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - post_f_samples; - seriescolor="red", - linealpha=0.2, - legend=false -) +plt = plot(x_plot, post_f_samples; seriescolor = "red", linealpha = 0.2, legend = false) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. @@ -151,10 +133,10 @@ post_y_samples = mean.(l_post.lik.(post_f_samples)) plt = plot( x_plot, post_y_samples; - seriescolor="red", - linealpha=0.2, + seriescolor = "red", + linealpha = 0.2, # legend=false, - label="" + label = "", ) -scatter!(plt, x, y; seriescolor="blue", label="Data points") -vline!(z; label="Pseudo-points") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +vline!(z; label = "Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index d537f448..82a31e61 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -8,7 +8,7 @@ using Optim using IterTools using Plots -default(; legend=:outertopright, size=(700, 400)) +default(; legend = :outertopright, size = (700, 400)) using Random Random.seed!(1234) @@ -23,7 +23,7 @@ N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) -scatter(x, y; xlabel="x", ylabel="y", legend=false) +scatter(x, y; xlabel = "x", ylabel = "y", legend = false) # %% @@ -34,13 +34,13 @@ lik_noise = 0.3 jitter = 1e-5 struct SVGPModel - k # kernel parameters - m # variational mean - A # variational covariance - z # inducing points + k::Any # kernel parameters + m::Any # variational mean + A::Any # variational covariance + z::Any # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -68,7 +68,7 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. 
-function flux_loss(x, y; n_data=length(y)) +function flux_loss(x, y; n_data = length(y)) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -90,7 +90,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +data_loader = Flux.Data.DataLoader((x, y), batchsize = b) # %% # Negative ELBO before training @@ -99,10 +99,10 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data=N), + (x, y) -> flux_loss(x, y; n_data = N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt + opt, ) # %% @@ -116,16 +116,16 @@ post = posterior(model) scatter( x, y; - markershape=:xcross, - markeralpha=0.1, - xlim=(-1, 1), - xlabel="x", - ylabel="y", - title="posterior (VI with sparse grid)", - label="Train Data", + markershape = :xcross, + markeralpha = 0.1, + xlim = (-1, 1), + xlabel = "x", + ylabel = "y", + title = "posterior (VI with sparse grid)", + label = "Train Data", ) -plot!(-1:0.001:1, post; label="Posterior") -vline!(z; label="Pseudo-points") +plot!(-1:0.001:1, post; label = "Posterior") +vline!(z; label = "Pseudo-points") # %% There is a closed form optimal solution for the variational posterior q(u) @@ -137,8 +137,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -164,15 +164,14 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - markershape=:xcross, - markeralpha=0.1, - xlim=(-1, 1), - xlabel="x", - ylabel="y", - title="posterior (VI with sparse grid)", - label="Train Data", + markershape = :xcross, + markeralpha = 0.1, + xlim = (-1, 1), + xlabel = "x", + ylabel = "y", + title = "posterior (VI with sparse grid)", + label = "Train Data", ) -plot!(-1:0.001:1, ap_ex; label="SVGP posterior") -plot!(-1:0.001:1, ap_tits; label="Titsias posterior") -vline!(z; label="Pseudo-points") - +plot!(-1:0.001:1, ap_ex; label = "SVGP posterior") +plot!(-1:0.001:1, ap_tits; label = "Titsias posterior") +vline!(z; label = "Pseudo-points") diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 575ed4cb..c0a165af 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,19 +13,9 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, - FiniteGP, - LatentFiniteGP, - ApproxPosteriorGP, - At_A, - diag_At_A, - Xt_invA_X + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A -export SVGP, - Default, - Analytic, - Quadrature, - MonteCarlo +export SVGP, Default, Analytic, Quadrature, MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index e25e43da..94a16e49 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,7 +4,7 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood + GammaLikelihood, } @@ -43,18 +43,26 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, + fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y), - method=Default() + n_data = length(y), + method = Default(), ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) - return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo( + ::FiniteGP, + ::AbstractVector, + ::FiniteGP, + ::AbstractMvNormal; + kwargs..., +) + return error( + "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", + ) end """ @@ -67,8 +75,8 @@ function AbstractGPs.elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y), - method=Default() + n_data = length(y), + method = Default(), ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -81,7 +89,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer + n_data::Integer, ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -132,7 +140,7 @@ function expected_loglik( ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) method = _default_method(lik) expected_loglik(method, y, q_f, lik) @@ -143,9 +151,11 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood + lik::GaussianLikelihood, ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) + return sum( + -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²), + ) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -153,8 +163,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink} - ) + ::PoissonLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -164,8 +174,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink} - ) + ::ExponentialLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -175,22 +185,20 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any, ExpLink} - ) + lik::GammaLikelihood{<:Any,ExpLink}, +) f_μ = mean.(q_f) - return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) - .- lik.α * f_μ .- loggamma(lik.α)) + return sum( + (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- + loggamma(lik.α), + ) end -function expected_loglik( - ::Analytic, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik -) +function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) return error( - "No analytic solution exists for ", typeof(lik), - ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." + "No analytic solution exists for ", + typeof(lik), + ". 
Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", ) end @@ -198,7 +206,7 @@ function expected_loglik( mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -211,7 +219,7 @@ function expected_loglik( gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -220,16 +228,12 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1/√π) * lls * ws) + return sum((1 / √π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) -AnalyticLikelihood = Union{ - PoissonLikelihood, - GaussianLikelihood, - ExponentialLikelihood, - GammaLikelihood -} +AnalyticLikelihood = + Union{PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood} _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index 12b9e293..f9a10838 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,8 +21,8 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α=Kuu \ (m - mean(fz)) - data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) + α = Kuu \ (m - mean(fz)) + data = (A = A, m = m, Kuu = Kuu, B = B, α = α, u = fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index 07b89035..e4394da0 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,10 +25,12 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic push!(methods, def) end + if def isa Analytic + push!(methods, def) + end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x->isapprox(x, results[end], rtol=1e-3), results) + @test all(x -> isapprox(x, results[end], rtol = 1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 162e8d3d..8b375eb9 100644 --- a/test/equivalences.jl 
+++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -42,9 +42,9 @@ ## FIRST - define the models # GPR - Exact GP regression struct GPRModel - k # kernel parameters + k::Any # kernel parameters end - @Flux.functor GPRModel + Flux.@functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -54,12 +54,12 @@ # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel - k # kernel parameters - z # inducing points - m # variational mean - A # variational covariance sqrt (Σ = A'A) + k::Any # kernel parameters + z::Any # inducing points + m::Any # variational mean + A::Any # variational covariance sqrt (Σ = A'A) end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -111,9 +111,8 @@ svgp_post = posterior(svgp) ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol = 1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol = 1e-4)) end end - diff --git a/test/svgp.jl b/test/svgp.jl index 7dd90692..9cfef116 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -7,14 +7,14 @@ # Specify prior. f = GP(Matern32Kernel()) # Sample from prior. - x = collect(range(-1.0, 1.0; length=N_cond)) + x = collect(range(-1.0, 1.0; length = N_cond)) fx = f(x, 1e-15) y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) f_approx_post = approx_posterior(SVGP(), fx, q) - a = collect(range(-1.0, 1.0; length=N_a)) + a = collect(range(-1.0, 1.0; length = N_a)) b = randn(rng, N_b) AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end diff --git a/test/test_utils.jl b/test/test_utils.jl index 02b50670..805c799d 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,12 +5,13 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. 
function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || + error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end From 39f243a3622b13c61b5f6db89d8fe85735836dda Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:28:37 +0100 Subject: [PATCH 65/66] Revert "Ran JuliaFormatter" This reverts commit be967226317e5f9aa7e23474b9b86c9434d70d6a. --- examples/classification.jl | 58 ++++++++++++++++++---------- examples/regression.jl | 65 +++++++++++++++---------------- src/SparseGPs.jl | 14 ++++++- src/elbo.jl | 78 ++++++++++++++++++-------------------- src/svgp.jl | 12 +++--- test/elbo.jl | 6 +-- test/equivalences.jl | 29 +++++++------- test/svgp.jl | 4 +- test/test_utils.jl | 7 ++-- 9 files changed, 148 insertions(+), 125 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index c97dc556..ab476bce 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,7 @@ using DelimitedFiles using IterTools using Plots -default(; legend = :outertopright, size = (700, 400)) +default(; legend=:outertopright, size=(700, 400)) using Random Random.seed!(1234) @@ -40,10 +40,16 @@ fx = f(x) # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6), 20) +prior_f_samples = rand(f.f(x_plot, 1e-6),20) -plt = plot(x_plot, prior_f_samples; seriescolor = "red", linealpha = 0.2, label = "") -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +plt = plot( + x_plot, + prior_f_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% @@ -51,8 +57,14 @@ scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # them in (0, 1). 
prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot(x_plot, prior_y_samples; seriescolor = "red", linealpha = 0.2, label = "") -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +plt = plot( + x_plot, + prior_y_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% @@ -60,13 +72,13 @@ scatter!(plt, x, y; seriescolor = "blue", label = "Data points") using Flux struct SVGPModel - k::Any # kernel parameters - m::Any # variational mean - A::Any # variational covariance - z::Any # inducing points + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end -Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -80,9 +92,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data = length(y)) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method = MonteCarlo()) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) end # %% @@ -109,7 +121,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt, + opt ) # %% @@ -124,7 +136,13 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot(x_plot, post_f_samples; seriescolor = "red", linealpha = 0.2, legend = false) +plt = plot( + x_plot, + post_f_samples; + seriescolor="red", + linealpha=0.2, + legend=false +) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. @@ -133,10 +151,10 @@ post_y_samples = mean.(l_post.lik.(post_f_samples)) plt = plot( x_plot, post_y_samples; - seriescolor = "red", - linealpha = 0.2, + seriescolor="red", + linealpha=0.2, # legend=false, - label = "", + label="" ) -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") -vline!(z; label = "Pseudo-points") +scatter!(plt, x, y; seriescolor="blue", label="Data points") +vline!(z; label="Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index 82a31e61..d537f448 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -8,7 +8,7 @@ using Optim using IterTools using Plots -default(; legend = :outertopright, size = (700, 400)) +default(; legend=:outertopright, size=(700, 400)) using Random Random.seed!(1234) @@ -23,7 +23,7 @@ N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) -scatter(x, y; xlabel = "x", ylabel = "y", legend = false) +scatter(x, y; xlabel="x", ylabel="y", legend=false) # %% @@ -34,13 +34,13 @@ lik_noise = 0.3 jitter = 1e-5 struct SVGPModel - k::Any # kernel parameters - m::Any # variational mean - A::Any # variational covariance - z::Any # inducing points + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end -Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -68,7 +68,7 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. 
-function flux_loss(x, y; n_data = length(y)) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -90,7 +90,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize = b) +data_loader = Flux.Data.DataLoader((x, y), batchsize=b) # %% # Negative ELBO before training @@ -99,10 +99,10 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data = N), + (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt, + opt ) # %% @@ -116,16 +116,16 @@ post = posterior(model) scatter( x, y; - markershape = :xcross, - markeralpha = 0.1, - xlim = (-1, 1), - xlabel = "x", - ylabel = "y", - title = "posterior (VI with sparse grid)", - label = "Train Data", + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", ) -plot!(-1:0.001:1, post; label = "Posterior") -vline!(z; label = "Pseudo-points") +plot!(-1:0.001:1, post; label="Posterior") +vline!(z; label="Pseudo-points") # %% There is a closed form optimal solution for the variational posterior q(u) @@ -137,8 +137,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) - m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -164,14 +164,15 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - markershape = :xcross, - markeralpha = 0.1, - xlim = (-1, 1), - xlabel = "x", - ylabel = "y", - title = "posterior (VI with sparse grid)", - label = "Train Data", + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", ) -plot!(-1:0.001:1, ap_ex; label = "SVGP posterior") -plot!(-1:0.001:1, ap_tits; label = "Titsias posterior") -vline!(z; label = "Pseudo-points") +plot!(-1:0.001:1, ap_ex; label="SVGP posterior") +plot!(-1:0.001:1, ap_tits; label="Titsias posterior") +vline!(z; label="Pseudo-points") + diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index c0a165af..575ed4cb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,9 +13,19 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A + AbstractGP, + FiniteGP, + LatentFiniteGP, + ApproxPosteriorGP, + At_A, + diag_At_A, + Xt_invA_X -export SVGP, Default, Analytic, Quadrature, MonteCarlo +export SVGP, + Default, + Analytic, + Quadrature, + MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index 94a16e49..e25e43da 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,7 +4,7 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood, + GammaLikelihood } @@ -43,26 +43,18 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, + fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data = length(y), - method = Default(), + n_data=length(y), + method=Default() ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo( - ::FiniteGP, - ::AbstractVector, - ::FiniteGP, - ::AbstractMvNormal; - kwargs..., -) - return error( - "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", - ) +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) + return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") end """ @@ -75,8 +67,8 @@ function AbstractGPs.elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data = length(y), - method = Default(), + n_data=length(y), + method=Default() ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -89,7 +81,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer, + n_data::Integer ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -140,7 +132,7 @@ function expected_loglik( ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) method = _default_method(lik) expected_loglik(method, y, q_f, lik) @@ -151,11 +143,9 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood, + lik::GaussianLikelihood ) - return sum( - -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²), - ) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -163,8 +153,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink}, -) + ::PoissonLikelihood{ExpLink} + ) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -174,8 +164,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink}, -) + ::ExponentialLikelihood{ExpLink} + ) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -185,20 +175,22 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any,ExpLink}, -) - f_μ = mean.(q_f) - return sum( - (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- - loggamma(lik.α), + lik::GammaLikelihood{<:Any, ExpLink} ) + f_μ = mean.(q_f) + return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) + .- lik.α * f_μ .- loggamma(lik.α)) end -function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) +function expected_loglik( + ::Analytic, + y::AbstractVector, + q_f::AbstractVector{<:Normal}, + lik +) return error( - "No analytic solution exists for ", - typeof(lik), - ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", + "No analytic solution exists for ", typeof(lik), + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." 
) end @@ -206,7 +198,7 @@ function expected_loglik( mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -219,7 +211,7 @@ function expected_loglik( gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -228,12 +220,16 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1 / √π) * lls * ws) + return sum((1/√π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) -AnalyticLikelihood = - Union{PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood} +AnalyticLikelihood = Union{ + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index f9a10838..12b9e293 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,8 +21,8 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α = Kuu \ (m - mean(fz)) - data = (A = A, m = m, Kuu = Kuu, B = B, α = α, u = fz.x) + α=Kuu \ (m - mean(fz)) + data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index e4394da0..07b89035 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,12 +25,10 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic - push!(methods, def) - end + if def isa Analytic push!(methods, def) end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x -> isapprox(x, results[end], rtol = 1e-3), results) + @test all(x->isapprox(x, results[end], rtol=1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 8b375eb9..162e8d3d 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 
* cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -42,9 +42,9 @@ ## FIRST - define the models # GPR - Exact GP regression struct GPRModel - k::Any # kernel parameters + k # kernel parameters end - Flux.@functor GPRModel + @Flux.functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -54,12 +54,12 @@ # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel - k::Any # kernel parameters - z::Any # inducing points - m::Any # variational mean - A::Any # variational covariance sqrt (Σ = A'A) + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) end - Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -111,8 +111,9 @@ svgp_post = posterior(svgp) ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol = 1e-4)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol = 1e-4)) + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end end + diff --git a/test/svgp.jl b/test/svgp.jl index 9cfef116..7dd90692 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -7,14 +7,14 @@ # Specify prior. f = GP(Matern32Kernel()) # Sample from prior. - x = collect(range(-1.0, 1.0; length = N_cond)) + x = collect(range(-1.0, 1.0; length=N_cond)) fx = f(x, 1e-15) y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) f_approx_post = approx_posterior(SVGP(), fx, q) - a = collect(range(-1.0, 1.0; length = N_a)) + a = collect(range(-1.0, 1.0; length=N_a)) b = randn(rng, N_b) AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end diff --git a/test/test_utils.jl b/test/test_utils.jl index 805c799d..02b50670 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,13 +5,12 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. 
function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || - error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) - m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end From ef3292cb44f74581fce32e6d4563b4611870deb3 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:37:07 +0100 Subject: [PATCH 66/66] Reformat with JuliaFormatter - BlueStyle --- .JuliaFormatter.toml | 1 + examples/classification.jl | 36 ++++------------- examples/regression.jl | 14 +++---- src/SparseGPs.jl | 14 +------ src/elbo.jl | 79 +++++++++++++++++--------------------- src/svgp.jl | 10 ++--- test/elbo.jl | 6 ++- test/equivalences.jl | 16 ++++---- test/test_utils.jl | 7 ++-- 9 files changed, 70 insertions(+), 113 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 00000000..323237ba --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1 @@ +style = "blue" diff --git a/examples/classification.jl b/examples/classification.jl index ab476bce..b1b7f6fc 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -23,7 +23,6 @@ data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv" x, y = eachcol(readdlm(data_file)) scatter(x, y) - # %% # First, create the GP kernel from given parameters k function make_kernel(k) @@ -36,37 +35,22 @@ kernel = make_kernel(k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) fx = f(x) - # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6),20) +prior_f_samples = rand(f.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - prior_f_samples; - seriescolor="red", - linealpha=0.2, - label="" -) +plt = plot(x_plot, prior_f_samples; seriescolor="red", linealpha=0.2, label="") scatter!(plt, x, y; seriescolor="blue", label="Data points") - # %% # Plot the same samples, but pushed through a logistic sigmoid to constrain # them in (0, 1). prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot( - x_plot, - prior_y_samples; - seriescolor="red", - linealpha=0.2, - label="" -) +plt = plot(x_plot, prior_y_samples; seriescolor="red", linealpha=0.2, label="") scatter!(plt, x, y; seriescolor="blue", label="Data points") - # %% # A simple Flux model using Flux @@ -78,7 +62,7 @@ struct SVGPModel z # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -121,7 +105,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt + opt, ) # %% @@ -136,13 +120,7 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - post_f_samples; - seriescolor="red", - linealpha=0.2, - legend=false -) +plt = plot(x_plot, post_f_samples; seriescolor="red", linealpha=0.2, legend=false) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. 
@@ -154,7 +132,7 @@ plt = plot( seriescolor="red", linealpha=0.2, # legend=false, - label="" + label="", ) scatter!(plt, x, y; seriescolor="blue", label="Data points") vline!(z; label="Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index d537f448..518ee761 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -25,7 +25,6 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) - # %% # A simple Flux model using Flux @@ -40,7 +39,7 @@ struct SVGPModel z # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -73,7 +72,6 @@ function flux_loss(x, y; n_data=length(y)) return -SparseGPs.elbo(fx, y, fu, q; n_data) end - # %% M = 50 # number of inducing points @@ -90,7 +88,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +data_loader = Flux.Data.DataLoader((x, y); batchsize=b) # %% # Negative ELBO before training @@ -102,7 +100,7 @@ Flux.train!( (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt + opt, ) # %% @@ -127,7 +125,6 @@ scatter( plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") - # %% There is a closed form optimal solution for the variational posterior q(u) # (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). The SVGP posterior with this optimal q(u) should @@ -137,8 +134,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -175,4 +172,3 @@ scatter( plot!(-1:0.001:1, ap_ex; label="SVGP posterior") plot!(-1:0.001:1, ap_tits; label="Titsias posterior") vline!(z; label="Pseudo-points") - diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 575ed4cb..c0a165af 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,19 +13,9 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, - FiniteGP, - LatentFiniteGP, - ApproxPosteriorGP, - At_A, - diag_At_A, - Xt_invA_X + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A -export SVGP, - Default, - Analytic, - Quadrature, - MonteCarlo +export SVGP, Default, Analytic, Quadrature, MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index e25e43da..38627ceb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,10 +4,9 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood + GammaLikelihood, } - abstract type ExpectationMethod end struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end @@ -43,18 +42,22 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, + fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=Default() + method=Default(), ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) - return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo( + ::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs... +) + return error( + "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", + ) end """ @@ -68,7 +71,7 @@ function AbstractGPs.elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=Default() + method=Default(), ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -81,7 +84,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer + n_data::Integer, ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -129,13 +132,10 @@ Defaults to a closed form solution if it exists, otherwise defaults to Gauss-Hermite quadrature. """ function expected_loglik( - ::Default, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) method = _default_method(lik) - expected_loglik(method, y, q_f, lik) + return expected_loglik(method, y, q_f, lik) end # The closed form solution for independent Gaussian noise @@ -143,9 +143,11 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood + lik::GaussianLikelihood, ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) + return sum( + -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²) + ) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -153,8 +155,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink} - ) + ::PoissonLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -164,8 +166,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink} - ) + ::ExponentialLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -175,30 +177,25 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any, ExpLink} - ) + lik::GammaLikelihood{<:Any,ExpLink}, +) f_μ = mean.(q_f) - return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) - .- lik.α * f_μ .- loggamma(lik.α)) + return sum( + (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- + loggamma(lik.α), + ) end -function expected_loglik( - ::Analytic, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik -) +function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) return error( - "No analytic solution exists for ", typeof(lik), - ". 
Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." + "No analytic solution exists for ", + typeof(lik), + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", ) end function expected_loglik( - mc::MonteCarlo, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -208,10 +205,7 @@ function expected_loglik( end function expected_loglik( - gh::Quadrature, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -220,16 +214,13 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1/√π) * lls * ws) + return sum((1 / √π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) AnalyticLikelihood = Union{ - PoissonLikelihood, - GaussianLikelihood, - ExponentialLikelihood, - GammaLikelihood + PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood } _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index 12b9e293..0e71a265 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,7 +21,7 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α=Kuu \ (m - mean(fz)) + α = Kuu \ (m - mean(fz)) data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index 07b89035..0f20367c 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,10 +25,12 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic push!(methods, def) end + if def isa Analytic + push!(methods, def) + end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x->isapprox(x, results[end], rtol=1e-3), results) + @test 
all(x -> isapprox(x, results[end]; rtol=1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 162e8d3d..3bb4c7bc 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -44,7 +44,7 @@ struct GPRModel k # kernel parameters end - @Flux.functor GPRModel + Flux.@functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -59,7 +59,7 @@ m # variational mean A # variational covariance sqrt (Σ = A'A) end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -114,6 +114,4 @@ @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end - end - diff --git a/test/test_utils.jl b/test/test_utils.jl index 02b50670..805c799d 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,12 +5,13 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || + error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end
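As a closing sanity check on the quadrature fallback in expected_loglik: the change of
variables used there (f = √2 σ x + μ, with the weighted sum divided by √π) should agree
with the analytic Gaussian expectation from elbo.jl. A minimal sketch, assuming
FastGaussQuadrature and Distributions are available; the numeric constants are arbitrary
illustration values:

    using FastGaussQuadrature, Distributions

    μ, σ = 0.3, 0.8            # one marginal q(f) = N(μ, σ²)
    y, σn = 0.1, 0.5           # observation and Gaussian noise std
    xs, ws = gausshermite(20)  # nodes and weights for ∫ exp(-x²) g(x) dx

    # reparameterised quadrature, as in expected_loglik(::Quadrature, ...)
    quad = sum(ws .* logpdf.(Normal.(√2 .* σ .* xs .+ μ, σn), y)) / √π

    # closed form E_q[log N(y; f, σn²)], as in expected_loglik(::Analytic, ...)
    exact = -0.5 * (log(2π * σn^2) + ((y - μ)^2 + σ^2) / σn^2)

    quad ≈ exact               # the integrand is quadratic in f, so this matches
                               # to well within the rtol = 1e-3 used in the test above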