From 3cec6a0935a483ebce5e5fea2b6e4e4729acf60a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 19 Jun 2021 17:32:22 +0100 Subject: [PATCH 01/66] Add files --- Manifest.toml | 434 ++++++++++++++++++++++++++++++++++++++++ Project.toml | 13 ++ examples/gpflow_svgp.jl | 132 ++++++++++++ src/SparseGPs.jl | 19 ++ src/svgp.jl | 53 +++++ 5 files changed, 651 insertions(+) create mode 100644 Manifest.toml create mode 100644 Project.toml create mode 100644 examples/gpflow_svgp.jl create mode 100644 src/SparseGPs.jl create mode 100644 src/svgp.jl diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 00000000..6953aae4 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,434 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractGPs]] +deps = ["ChainRulesCore", "Distributions", "FillArrays", "KernelFunctions", "LinearAlgebra", "Random", "RecipesBase", "Reexport", "Statistics", "StatsBase"] +git-tree-sha1 = "d3700bd0201d2ec29c0b18d6f3f971f7072fe491" +uuid = "99985d1d-32ba-4be9-9821-2ec096f28918" +version = "0.3.5" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[ArrayInterface]] +deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] +git-tree-sha1 = "045ff5e1bc8c6fb1ecb28694abba0a0d55b5f4f5" +uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "3.1.17" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "4289a76df5a8568cca9970e54dd585c6c395c496" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "0.10.7" + +[[CommonSubexpressions]] +deps = ["MacroTools", "Test"] +git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.3.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.30.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[CompositionsBase]] +git-tree-sha1 = "f3955eb38944e5dd0fabf8ca1e267d94941d34a5" +uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" +version = "0.1.0" + +[[DataAPI]] +git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.6.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.9" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["StaticArrays"] +git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "1.0.3" + +[[DiffRules]] +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "1.0.2" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics", "StatsAPI"] +git-tree-sha1 = "abe4ad222b26af3337262b8afb28fab8d215e9f8" 
+uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.10.3" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] +git-tree-sha1 = "62e1ac52e9adf4234285cd88c94954924aa3f9ef" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.25.5" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.11.7" + +[[FiniteDiff]] +deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] +git-tree-sha1 = "f6f80c8f934efd49a286bb5315360be66956dfc4" +uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" +version = "2.8.0" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.18" + +[[Functors]] +deps = ["MacroTools"] +git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" +uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +version = "0.2.1" + +[[IfElse]] +git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" +uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" +version = "0.1.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelFunctions]] +deps = ["ChainRulesCore", "Compat", "CompositionsBase", "Distances", "FillArrays", "Functors", "LinearAlgebra", "Random", "Requires", "SpecialFunctions", "StatsBase", "StatsFuns", "TensorCore", "Test", "ZygoteRules"] +git-tree-sha1 = "c7b25bc625ca2ee217021d29e3ddf031967bf0ff" +uuid = "ec8451be-7e33-11e9-00cf-bbf324bd1392" +version = "0.10.5" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LineSearches]] +deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf"] +git-tree-sha1 = "f27132e551e959b3667d8c93eae90973225032dd" +uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" +version = "7.1.1" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "LinearAlgebra"] +git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.2.4" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + 
+[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.0.0" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NLSolversBase]] +deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] +git-tree-sha1 = "50608f411a1e178e0129eab4110bd56efd08816f" +uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" +version = "7.8.0" + +[[NaNMath]] +git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.5" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[Optim]] +deps = ["Compat", "FillArrays", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "SparseArrays", "StatsBase"] +git-tree-sha1 = "d34366a3abc25c41f88820762ef7dfdfe9306711" +uuid = "429524aa-4258-5aef-a3af-852621145aeb" +version = "1.3.0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[PDMats]] +deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] +git-tree-sha1 = "4dd403333bcf0909341cfe57ec115152f937d7d8" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.11.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PositiveFactorizations]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" +uuid = "85a6dd25-e78a-55b7-8502-1745935b8125" +version = "0.2.4" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "12fbe86da16df6679be7521dfb39fbc861e1dc7b" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.4.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RecipesBase]] +git-tree-sha1 = "b3fb709f3c97bfc6e948be68beeecb55a0b340ae" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "1.1.1" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = 
"4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[Rmath]] +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "bf3188feca147ce108c76ad82c2792c57abe7b1f" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.7.0" + +[[Rmath_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "68db32dff12bb6127bac73c209881191bf0efbb7" +uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.3.0+0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "2ec1962eba973f383239da22e75218565c390a96" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.0.0" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.5.1" + +[[Static]] +deps = ["IfElse"] +git-tree-sha1 = "2740ea27b66a41f9d213561a04573da5d3823d4b" +uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" +version = "0.2.5" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "57a9b3c69933e15e5b7041b6a57d1533ef1a9882" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.3" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsAPI]] +git-tree-sha1 = "1958272568dc176a1d881acb797beb909c785510" +uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" +version = "1.0.0" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] +git-tree-sha1 = "2f6792d523d7448bbe2fec99eca9218f06cc746d" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.33.8" + +[[StatsFuns]] +deps = ["LogExpFunctions", "Rmath", "SpecialFunctions"] +git-tree-sha1 = "30cd8c360c54081f806b1ee14d2eecbef3c04c49" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.8" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[TensorCore]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" +uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50" +version = "0.1.1" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[ZygoteRules]] +deps = ["MacroTools"] +git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" +uuid = "700de1a5-db45-46bc-99cf-38207098b444" +version = "0.2.1" + +[[nghttp2_jll]] +deps = 
["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..e97fe5a2 --- /dev/null +++ b/Project.toml @@ -0,0 +1,13 @@ +name = "SparseGPs" +uuid = "298c2ebc-0411-48ad-af38-99e88101b606" +authors = ["Ross Viljoen "] +version = "0.1.0" + +[deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Optim = "429524aa-4258-5aef-a3af-852621145aeb" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl new file mode 100644 index 00000000..935f5330 --- /dev/null +++ b/examples/gpflow_svgp.jl @@ -0,0 +1,132 @@ +# An attempted recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html + +using AbstractGPs +using SparseGPs +using Distributions +using LinearAlgebra +using StatsFuns +using Optim + +using Plots +default(; legend=:outertopright, size=(700, 400)) + +using Random +Random.seed!(1234) + +# %% +function g(x) + return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) +end + +N = 1000 # Number of training points +x = rand(Uniform(-1, 1), N) +y = g.(x) + 0.3 * randn(N) + +scatter(x, y; xlabel="x", ylabel="y", legend=false) + +# %% +M = 30 # number of inducing points + +function pack_params(θ, m, A) + return vcat(θ, m, vec(A)) +end + +function unpack_params(params, m; include_z=false) + if include_z + k = params[1:2] + z = params[3:m+2] + μ = params[m+3:2m+2] + s = params[2m+3:end] + Σ = reshape(s, (M, M)) + return k, z, μ, Σ + else + k = params[1:2] + μ = params[3:m+2] + s = params[m+3:end] + Σ = reshape(s, (M, M)) + return k, μ, Σ + end +end + +x0 = pack_params(rand(2), zeros(M), vec(Matrix{Float64}(I, M, M))) +z = x[1:M] + +# %% +function objective_function(x, y) + function neg_elbo(params) + # k, z, qμ, qΣ_L = split_params(params, M) + k, m, A = unpack_params(params, M) + kernel = + (softplus(k[1])) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + f = GP(kernel) + fx = f(x, 0.1) + q = MvNormal(m, A'A) + return -SparseGPs.elbo(fx, y, f(z), q) + end + return neg_elbo +end + +# Currently fails at the cholesky factorisation of cov(f(z)) +opt = optimize(objective_function(x, y), x0, LBFGS()) + +# %% +opt_k, opt_μ, opt_Σ_L = unpack_params(opt.minimizer, M) +opt_kernel = + softplus(opt_k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(opt_k[2]) + 0.01)) +opt_f = GP(opt_kernel) +opt_q = MvNormal(opt_μ, opt_Σ_L * opt_Σ_L') +ap = SparseGPs.approx_posterior(SVGP(), opt_f(z), opt_q) +logpdf(ap(x), y) + +# %% +scatter( + x, + y; + xlim=(0, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", +) +# scatter!(x, y; label="Test Data") +plot!(-1:0.001:1, ap; label=false) +vline!(z; label="Pseudo-points") + + +# %% Find the exact posterior over u (e.g. +# https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations +# (11) & (12)) As a sanity check -- this seems to work. 
+ +function exact_q(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + A = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, A) +end + +kernel = 0.3 * (SqExponentialKernel() ∘ ScaleTransform(10)) +f = GP(kernel) +fx = f(x) +fu = f(z) +q_ex = exact_q(fu, fx, y) + +scatter(x, y) +scatter!(z, q_ex.μ) + +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) + +# %% +scatter( + x, + y; + xlim=(0, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", +) +plot!(-1:0.001:1, ap_ex; label=false) +vline!(z; label="Pseudo-points") diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl new file mode 100644 index 00000000..24f7faa3 --- /dev/null +++ b/src/SparseGPs.jl @@ -0,0 +1,19 @@ +module SparseGPs + +using AbstractGPs +using Distributions +using Optim +using StatsFuns +using LinearAlgebra +using Statistics +using StatsBase + +using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, Xt_invA_X, diag_At_A + +export elbo, + approx_posterior, + SVGP + +include("svgp.jl") + +end diff --git a/src/svgp.jl b/src/svgp.jl new file mode 100644 index 00000000..64a19d63 --- /dev/null +++ b/src/svgp.jl @@ -0,0 +1,53 @@ +struct SVGP end # TODO: should probably just be VFE? + +function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) + m, A = q.μ, q.Σ.chol + Kuu = cholesky(Symmetric(cov(fu))) + B = Kuu.L \ A.L + data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) + return ApproxPosteriorGP(SVGP(), fu.f, data) +end + +function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + # TODO: Don't compute the full covar + return diag(cov(f, x)) +end + +function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + return cov(f.prior, x, f.data.u) * f.data.α +end + +function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + Kuu = f.data.Kuu + B = f.data.B + D = f.data.Kuu.L \ Cux + return cov(f.prior, x) - D' * B * B' * D +end + +function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + # TODO: implement properly + return mean(f, x), cov(f, x) +end + +function kl_divergence(q::MvNormal, p::AbstractMvNormal) + (1/2) * (logdet(q.Σ.chol) + - logdet(cov(p)) - length(mean(p)) + + tr(inv(q.Σ.chol) * cov(p)) + Xt_invA_X(q.Σ.chol, (mean(q)-mean(p)))) +end + +function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) + kl_term = kl_divergence(q, fu) + post = approx_posterior(SVGP(), fu, q) + f_mean = mean(post, fx.x) + f_var = var(post, fx.x) + + Σy = diag(fx.Σy) + + # TODO: general method for likelihoods - quadrature like GPFlow? + variational_exp = -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) + + # TODO: rescale for minibatches + return sum(variational_exp) - kl_term +end + From 6c814a6c23e35e833b879b28f7712b2ea7585265 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 20 Jun 2021 23:54:06 +0100 Subject: [PATCH 02/66] Fixed KL and posterior covariance. 
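The corrected covariance is intended to match the standard SVGP predictive form
(cf. Hensman et al. 2013), written informally with S = cov(q(u)):

    cov(f*) = Kxx - Kxu Kuu^-1 Kux + Kxu Kuu^-1 S Kuu^-1 Kux

With D = Luu \ Kux and B = Luu \ Ls (Luu and Ls the Cholesky factors of Kuu and
S), this reduces to Kxx - D'D + D'*B*B'*D, which is the expression cov() now uses.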
--- .gitignore | 11 ++ Manifest.toml | 434 -------------------------------------------------- src/svgp.jl | 14 +- 3 files changed, 18 insertions(+), 441 deletions(-) create mode 100644 .gitignore delete mode 100644 Manifest.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..47e36092 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +.DS_Store +.idea +*.log +tmp/ + +*.result +*.json +*.jld2 +*.cov +*.info +Manifest.toml diff --git a/Manifest.toml b/Manifest.toml deleted file mode 100644 index 6953aae4..00000000 --- a/Manifest.toml +++ /dev/null @@ -1,434 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[AbstractGPs]] -deps = ["ChainRulesCore", "Distributions", "FillArrays", "KernelFunctions", "LinearAlgebra", "Random", "RecipesBase", "Reexport", "Statistics", "StatsBase"] -git-tree-sha1 = "d3700bd0201d2ec29c0b18d6f3f971f7072fe491" -uuid = "99985d1d-32ba-4be9-9821-2ec096f28918" -version = "0.3.5" - -[[ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" - -[[ArrayInterface]] -deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"] -git-tree-sha1 = "045ff5e1bc8c6fb1ecb28694abba0a0d55b5f4f5" -uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "3.1.17" - -[[Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[ChainRulesCore]] -deps = ["Compat", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "4289a76df5a8568cca9970e54dd585c6c395c496" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "0.10.7" - -[[CommonSubexpressions]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" -uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" -version = "0.3.0" - -[[Compat]] -deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "e4e2b39db08f967cc1360951f01e8a75ec441cab" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.30.0" - -[[CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" - -[[CompositionsBase]] -git-tree-sha1 = "f3955eb38944e5dd0fabf8ca1e267d94941d34a5" -uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" -version = "0.1.0" - -[[DataAPI]] -git-tree-sha1 = "dfb3b7e89e395be1e25c2ad6d7690dc29cc53b1d" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.6.0" - -[[DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.9" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[DelimitedFiles]] -deps = ["Mmap"] -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" - -[[DiffResults]] -deps = ["StaticArrays"] -git-tree-sha1 = "c18e98cba888c6c25d1c3b048e4b3380ca956805" -uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "1.0.3" - -[[DiffRules]] -deps = ["NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "214c3fcac57755cfda163d91c58893a8723f93e9" -uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.0.2" - -[[Distances]] -deps = ["LinearAlgebra", "Statistics", "StatsAPI"] -git-tree-sha1 = "abe4ad222b26af3337262b8afb28fab8d215e9f8" -uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -version = "0.10.3" - -[[Distributed]] -deps = 
["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "62e1ac52e9adf4234285cd88c94954924aa3f9ef" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.5" - -[[DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.5" - -[[Downloads]] -deps = ["ArgTools", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "31939159aeb8ffad1d4d8ee44d07f8558273120a" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.11.7" - -[[FiniteDiff]] -deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] -git-tree-sha1 = "f6f80c8f934efd49a286bb5315360be66956dfc4" -uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" -version = "2.8.0" - -[[ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "NaNMath", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "e2af66012e08966366a43251e1fd421522908be6" -uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.18" - -[[Functors]] -deps = ["MacroTools"] -git-tree-sha1 = "a7bb2af991c43dcf5c3455d276dd83976799634f" -uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -version = "0.2.1" - -[[IfElse]] -git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" -uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" -version = "0.1.0" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.3.0" - -[[KernelFunctions]] -deps = ["ChainRulesCore", "Compat", "CompositionsBase", "Distances", "FillArrays", "Functors", "LinearAlgebra", "Random", "Requires", "SpecialFunctions", "StatsBase", "StatsFuns", "TensorCore", "Test", "ZygoteRules"] -git-tree-sha1 = "c7b25bc625ca2ee217021d29e3ddf031967bf0ff" -uuid = "ec8451be-7e33-11e9-00cf-bbf324bd1392" -version = "0.10.5" - -[[LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" - -[[LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" - -[[LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LineSearches]] -deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf"] -git-tree-sha1 = "f27132e551e959b3667d8c93eae90973225032dd" -uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" -version = "7.1.1" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[LogExpFunctions]] -deps = ["DocStringExtensions", "LinearAlgebra"] -git-tree-sha1 = "1ba664552f1ef15325e68dc4c05c3ef8c2d5d885" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.2.4" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = 
"6a8a2a625ab0dea913aba95c11370589e0239ff0" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.6" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "4ea90bd5d3985ae1f9a908bd4500ae88921c5ce7" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.0.0" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" - -[[NLSolversBase]] -deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] -git-tree-sha1 = "50608f411a1e178e0129eab4110bd56efd08816f" -uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" -version = "7.8.0" - -[[NaNMath]] -git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "0.3.5" - -[[NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" - -[[Optim]] -deps = ["Compat", "FillArrays", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "SparseArrays", "StatsBase"] -git-tree-sha1 = "d34366a3abc25c41f88820762ef7dfdfe9306711" -uuid = "429524aa-4258-5aef-a3af-852621145aeb" -version = "1.3.0" - -[[OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" - -[[PDMats]] -deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "4dd403333bcf0909341cfe57ec115152f937d7d8" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.11.1" - -[[Parameters]] -deps = ["OrderedCollections", "UnPack"] -git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" -uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" -version = "0.12.2" - -[[Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[PositiveFactorizations]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" -uuid = "85a6dd25-e78a-55b7-8502-1745935b8125" -version = "0.2.4" - -[[Preferences]] -deps = ["TOML"] -git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.2.2" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "12fbe86da16df6679be7521dfb39fbc861e1dc7b" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.4.1" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[RecipesBase]] -git-tree-sha1 = "b3fb709f3c97bfc6e948be68beeecb55a0b340ae" -uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -version = "1.1.1" - -[[Reexport]] -git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.1.0" - -[[Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" -uuid = 
"ae029012-a4dd-5104-9daa-d747884805df" -version = "1.1.3" - -[[Rmath]] -deps = ["Random", "Rmath_jll"] -git-tree-sha1 = "bf3188feca147ce108c76ad82c2792c57abe7b1f" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.7.0" - -[[Rmath_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "68db32dff12bb6127bac73c209881191bf0efbb7" -uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" -version = "0.3.0+0" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[SharedArrays]] -deps = ["Distributed", "Mmap", "Random", "Serialization"] -uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[SortingAlgorithms]] -deps = ["DataStructures"] -git-tree-sha1 = "2ec1962eba973f383239da22e75218565c390a96" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.0.0" - -[[SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[SpecialFunctions]] -deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] -git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "1.5.1" - -[[Static]] -deps = ["IfElse"] -git-tree-sha1 = "2740ea27b66a41f9d213561a04573da5d3823d4b" -uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -version = "0.2.5" - -[[StaticArrays]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "57a9b3c69933e15e5b7041b6a57d1533ef1a9882" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.2.3" - -[[Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[StatsAPI]] -git-tree-sha1 = "1958272568dc176a1d881acb797beb909c785510" -uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -version = "1.0.0" - -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "2f6792d523d7448bbe2fec99eca9218f06cc746d" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.8" - -[[StatsFuns]] -deps = ["LogExpFunctions", "Rmath", "SpecialFunctions"] -git-tree-sha1 = "30cd8c360c54081f806b1ee14d2eecbef3c04c49" -uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.8" - -[[SuiteSparse]] -deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] -uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" - -[[TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" - -[[Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" - -[[TensorCore]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" -uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50" -version = "0.1.1" - -[[Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[UnPack]] -git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" -uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" -version = "1.0.2" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" - -[[ZygoteRules]] -deps = ["MacroTools"] -git-tree-sha1 = "9e7a1e8ca60b742e508a315c17eef5211e7fbfd7" -uuid = "700de1a5-db45-46bc-99cf-38207098b444" -version = "0.2.1" - -[[nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = 
"8e850ede-7688-5339-a07c-302acd2aaf8d" - -[[p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/src/svgp.jl b/src/svgp.jl index 64a19d63..47fa7b57 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,7 +1,7 @@ struct SVGP end # TODO: should probably just be VFE? function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) - m, A = q.μ, q.Σ.chol + m, A = q.μ, cholesky(q.Σ) Kuu = cholesky(Symmetric(cov(fu))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) @@ -22,7 +22,7 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Kuu = f.data.Kuu B = f.data.B D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - D' * B * B' * D + return cov(f.prior, x) - D'D + D' * B * B' * D end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -31,9 +31,8 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function kl_divergence(q::MvNormal, p::AbstractMvNormal) - (1/2) * (logdet(q.Σ.chol) - - logdet(cov(p)) - length(mean(p)) - + tr(inv(q.Σ.chol) * cov(p)) + Xt_invA_X(q.Σ.chol, (mean(q)-mean(p)))) + (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + + AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) @@ -45,8 +44,9 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal Σy = diag(fx.Σy) # TODO: general method for likelihoods - quadrature like GPFlow? - variational_exp = -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) - + variational_exp = -0.5 * ( + log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy + ) # TODO: rescale for minibatches return sum(variational_exp) - kl_term end From 798f77a6f742e69a286507e807298aac3a98c6b4 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 00:53:48 +0100 Subject: [PATCH 03/66] Update example to use Flux --- Project.toml | 1 + examples/gpflow_svgp.jl | 118 ++++++++++++++++++++++------------------ 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/Project.toml b/Project.toml index e97fe5a2..92dd78b6 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 935f5330..158001e9 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -4,7 +4,6 @@ using AbstractGPs using SparseGPs using Distributions using LinearAlgebra -using StatsFuns using Optim using Plots @@ -25,59 +24,67 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) # %% -M = 30 # number of inducing points +M = 50 # number of inducing points -function pack_params(θ, m, A) - return vcat(θ, m, vec(A)) +# TODO: incorporate better inducing point selection from +# https://github.com/JuliaGaussianProcesses/InducingPoints.jl? 
+z = x[1:M] + +# %% +# A simple Flux model +using Flux + +struct SVGPModel + k + m + A + z end -function unpack_params(params, m; include_z=false) - if include_z - k = params[1:2] - z = params[3:m+2] - μ = params[m+3:2m+2] - s = params[2m+3:end] - Σ = reshape(s, (M, M)) - return k, z, μ, Σ - else - k = params[1:2] - μ = params[3:m+2] - s = params[m+3:end] - Σ = reshape(s, (M, M)) - return k, μ, Σ - end +function make_kernel(k) + return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) end -x0 = pack_params(rand(2), zeros(M), vec(Matrix{Float64}(I, M, M))) -z = x[1:M] +function (m::SVGPModel)(x, y) + kernel = make_kernel(m.k) + f = GP(kernel) + q = MvNormal(m.m, m.A'm.A + 0.001I) + fx = f(x, 0.1) + fu = f(m.z, 0.1) + return -SparseGPs.elbo(fx, y, fu, q) +end -# %% -function objective_function(x, y) - function neg_elbo(params) - # k, z, qμ, qΣ_L = split_params(params, M) - k, m, A = unpack_params(params, M) - kernel = - (softplus(k[1])) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - f = GP(kernel) - fx = f(x, 0.1) - q = MvNormal(m, A'A) - return -SparseGPs.elbo(fx, y, f(z), q) - end - return neg_elbo +function posterior(m::SVGPModel) + kernel = make_kernel(m.k) + f = GP(kernel) + fu = f(m.z, 0.1) + q = MvNormal(m.m, m.A'm.A + 0.0001I) + return SparseGPs.approx_posterior(SVGP(), fu, q) end -# Currently fails at the cholesky factorisation of cov(f(z)) -opt = optimize(objective_function(x, y), x0, LBFGS()) +k = [0.3, 10] +m = zeros(M) +A = Matrix{Float64}(I, M, M) -# %% -opt_k, opt_μ, opt_Σ_L = unpack_params(opt.minimizer, M) -opt_kernel = - softplus(opt_k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(opt_k[2]) + 0.01)) -opt_f = GP(opt_kernel) -opt_q = MvNormal(opt_μ, opt_Σ_L * opt_Σ_L') -ap = SparseGPs.approx_posterior(SVGP(), opt_f(z), opt_q) -logpdf(ap(x), y) +model = SVGPModel(k, m, A, z) + +function flux_loss(x, y) + return model(x, y) +end + +data = [(x, y)] +opt = ADAM(0.01) +parameters = Flux.params(k, m, A) + +println(flux_loss(x, y)) + +for epoch in 1:300 + Flux.train!(flux_loss, parameters, data, opt) +end + +println(flux_loss(x, y)) +post = posterior(model) # %% scatter( x, @@ -88,14 +95,13 @@ scatter( title="posterior (VI with sparse grid)", label="Train Data", ) -# scatter!(x, y; label="Test Data") -plot!(-1:0.001:1, ap; label=false) +plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") # %% Find the exact posterior over u (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations -# (11) & (12)) As a sanity check -- this seems to work. +# (11) & (12)) As a sanity check. function exact_q(fu, fx, y) σ² = fx.Σy[1] @@ -107,16 +113,22 @@ function exact_q(fu, fx, y) return MvNormal(m, A) end -kernel = 0.3 * (SqExponentialKernel() ∘ ScaleTransform(10)) +kernel = make_kernel([0.2, 11]) f = GP(kernel) -fx = f(x) -fu = f(z) +fx = f(x, 0.1) +fu = f(z, 0.1) q_ex = exact_q(fu, fx, y) scatter(x, y) scatter!(z, q_ex.μ) -ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) +# These two should be the same - and they are, the plot below shows almost identical predictions +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman 2013 (exact) posterior +ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior + +# Should these be the same? 
(they currently aren't) +SparseGPs.elbo(fx, y, fu, q_ex) +AbstractGPs.elbo(fx, y, fu) # %% scatter( @@ -128,5 +140,7 @@ scatter( title="posterior (VI with sparse grid)", label="Train Data", ) -plot!(-1:0.001:1, ap_ex; label=false) +plot!(-1:0.001:1, ap_ex; label="SVGP posterior") +plot!(-1:0.001:1, ap_tits; label="Titsias posterior") vline!(z; label="Pseudo-points") + From 0641423c1cfbee868ca66ceb7fabba3cd026bba9 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 20:01:27 +0100 Subject: [PATCH 04/66] Remove Flux as a dep & factor out expected_loglik --- Project.toml | 1 - src/svgp.jl | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 92dd78b6..e97fe5a2 100644 --- a/Project.toml +++ b/Project.toml @@ -6,7 +6,6 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" -Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/svgp.jl b/src/svgp.jl index 47fa7b57..5e57c721 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -26,8 +26,13 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - # TODO: implement properly - return mean(f, x), cov(f, x) + Cux = cov(f.prior, f.data.u, x) + Kuu = f.data.Kuu + B = f.data.B + D = f.data.Kuu.L \ Cux + μ = Cux' * f.data.α + Σ = cov(f.prior, x) - D'D + D' * B * B' * D + return μ, Σ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) @@ -35,6 +40,10 @@ function kl_divergence(q::MvNormal, p::AbstractMvNormal) AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) end +function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) +end + function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) @@ -43,10 +52,7 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal Σy = diag(fx.Σy) - # TODO: general method for likelihoods - quadrature like GPFlow? 
- variational_exp = -0.5 * ( - log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy - ) + variational_exp = expected_loglik(y, f_mean, f_var, Σy) # TODO: rescale for minibatches return sum(variational_exp) - kl_term end From 1e4fc90986ed5e7ba5296a59e2b732f1de9e9853 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 21 Jun 2021 21:32:00 +0100 Subject: [PATCH 05/66] Update example to use basic Flux layer --- examples/gpflow_svgp.jl | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 158001e9..4f56b605 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -34,31 +34,31 @@ z = x[1:M] # A simple Flux model using Flux -struct SVGPModel - k - m - A - z +struct SVGPLayer + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end function make_kernel(k) return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) end -function (m::SVGPModel)(x, y) +function (m::SVGPLayer)(x) kernel = make_kernel(m.k) f = GP(kernel) - q = MvNormal(m.m, m.A'm.A + 0.001I) + q = MvNormal(m.m, m.A'm.A) fx = f(x, 0.1) fu = f(m.z, 0.1) - return -SparseGPs.elbo(fx, y, fu, q) + return fx, fu, q end -function posterior(m::SVGPModel) +function posterior(m::SVGPLayer) kernel = make_kernel(m.k) f = GP(kernel) fu = f(m.z, 0.1) - q = MvNormal(m.m, m.A'm.A + 0.0001I) + q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end @@ -66,10 +66,11 @@ k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) -model = SVGPModel(k, m, A, z) +model = SVGPLayer(k, m, A, z) function flux_loss(x, y) - return model(x, y) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q) end data = [(x, y)] From bb42044683078cc106a2dc0dbfb9daf62fd3f321 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 22 Jun 2021 02:49:55 +0100 Subject: [PATCH 06/66] Add minibatching. 
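The data term is rescaled so that a minibatch still gives an unbiased estimate
of the full ELBO (assuming batches are drawn uniformly): for a batch of size b
out of N points,

    elbo ≈ (N / b) * Σ_{i in batch} E_{q(f_i)}[log p(y_i | f_i)] - KL(q(u) || p(u))

This is what the n_data / n_batch scale factor implements; the example passes
n_data=N and n_batch=b alongside a Flux DataLoader with batchsize=b.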
--- examples/gpflow_svgp.jl | 47 ++++++++++++++++++++++++++--------------- src/svgp.jl | 7 +++--- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/examples/gpflow_svgp.jl b/examples/gpflow_svgp.jl index 4f56b605..63f06668 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/gpflow_svgp.jl @@ -5,6 +5,7 @@ using SparseGPs using Distributions using LinearAlgebra using Optim +using IterTools using Plots default(; legend=:outertopright, size=(700, 400)) @@ -17,7 +18,7 @@ function g(x) return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) end -N = 1000 # Number of training points +N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) @@ -41,56 +42,68 @@ struct SVGPLayer z # inducing points end +@Flux.functor SVGPLayer + function make_kernel(k) - return Flux.softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(Flux.softplus(k[2]))) + return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end function (m::SVGPLayer)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) - fx = f(x, 0.1) - fu = f(m.z, 0.1) + fx = f(x, 0.3) + fu = f(m.z, 0.3) return fx, fu, q end function posterior(m::SVGPLayer) kernel = make_kernel(m.k) f = GP(kernel) - fu = f(m.z, 0.1) + fu = f(m.z, 0.3) q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end +function flux_loss(x, y; n_data=1, n_batch=1) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) +end + +# Initialise the parameters k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) model = SVGPLayer(k, m, A, z) -function flux_loss(x, y) - fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q) -end - -data = [(x, y)] +b = 100 # minibatch size opt = ADAM(0.01) -parameters = Flux.params(k, m, A) +# parameters = Flux.params(k, s, m, A) +parameters = Flux.params(model) +data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +# %% println(flux_loss(x, y)) -for epoch in 1:300 - Flux.train!(flux_loss, parameters, data, opt) -end +Flux.train!( + (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), + parameters, + ncycle(data_loader, 100), # Train for 100 epochs + opt +) println(flux_loss(x, y)) -post = posterior(model) # %% +post = posterior(model) + scatter( x, y; - xlim=(0, 1), + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), xlabel="x", ylabel="y", title="posterior (VI with sparse grid)", diff --git a/src/svgp.jl b/src/svgp.jl index 5e57c721..2b050616 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -44,16 +44,15 @@ function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_va return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end -function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal) +function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) f_mean = mean(post, fx.x) f_var = var(post, fx.x) - Σy = diag(fx.Σy) variational_exp = expected_loglik(y, f_mean, f_var, Σy) - # TODO: rescale for minibatches - return sum(variational_exp) - kl_term + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term end From 102d8128369f57a462b18ee67db3370dc86fcdd9 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Thu, 24 Jun 2021 00:28:44 +0100 Subject: [PATCH 07/66] Improved variance calculation. 
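The marginal variance only needs the diagonal of the posterior covariance, so
var and mean_and_var now use diag_At_A(D) and diag_At_A(B'D) rather than taking
diag() of the full dense matrix. Here diag_At_A(X) is understood to compute
diag(X'X), i.e. the column-wise sums of squares (roughly
vec(sum(abs2, X; dims=1))), which avoids forming the n-by-n covariance.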
--- src/SparseGPs.jl | 2 +- src/svgp.jl | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 24f7faa3..57697c8d 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -8,7 +8,7 @@ using LinearAlgebra using Statistics using StatsBase -using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, Xt_invA_X, diag_At_A +using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, At_A, diag_At_A export elbo, approx_posterior, diff --git a/src/svgp.jl b/src/svgp.jl index 2b050616..c082a83c 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,16 +1,17 @@ -struct SVGP end # TODO: should probably just be VFE? +struct SVGP end function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) - m, A = q.μ, cholesky(q.Σ) + m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fu))) - B = Kuu.L \ A.L + B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) return ApproxPosteriorGP(SVGP(), fu.f, data) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - # TODO: Don't compute the full covar - return diag(cov(f, x)) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -19,25 +20,29 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) - Kuu = f.data.Kuu - B = f.data.B D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - D'D + D' * B * B' * D + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) - Kuu = f.data.Kuu - B = f.data.B D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - D'D + D' * B * B' * D + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end +function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + μ = Cux' * f.data.α + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return μ, Σ_diag +end + function kl_divergence(q::MvNormal, p::AbstractMvNormal) (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + - AbstractGPs.Xt_invA_X(cholesky(q.Σ), (mean(q) - mean(p)))) + AbstractGPs.Xt_invA_X(cholesky(cov(q)), (mean(q) - mean(p)))) end function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) @@ -47,8 +52,7 @@ end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) - f_mean = mean(post, fx.x) - f_var = var(post, fx.x) + f_mean, f_var = mean_and_var(post, fx.x) Σy = diag(fx.Σy) variational_exp = expected_loglik(y, f_mean, f_var, Σy) From 3089d939eb4c9dc34da953c6605942d69435e848 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Thu, 24 Jun 2021 15:11:55 +0100 Subject: [PATCH 08/66] Initial quadrature implementation --- Project.toml | 2 ++ src/SparseGPs.jl | 12 +++++++++++- src/svgp.jl | 39 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index e97fe5a2..f0fe1a01 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,8 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = 
"31c24e10-a181-5473-b8eb-7969acd0382f" +FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" +GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 57697c8d..309864b2 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -7,8 +7,18 @@ using StatsFuns using LinearAlgebra using Statistics using StatsBase +using FastGaussQuadrature +using GPLikelihoods -using AbstractGPs: FiniteGP, ApproxPosteriorGP, _cholesky, _symmetric, At_A, diag_At_A +using AbstractGPs: + FiniteGP, + LatentFiniteGP, + ApproxPosteriorGP, + _cholesky, + _symmetric, + At_A, + diag_At_A, + Xt_invA_X export elbo, approx_posterior, diff --git a/src/svgp.jl b/src/svgp.jl index c082a83c..264b88dc 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -41,14 +41,37 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) end function kl_divergence(q::MvNormal, p::AbstractMvNormal) - (1/2) .* (logdet(cov(p)) - logdet(cov(q)) - length(mean(p)) + tr(cov(p) \ cov(q)) + - AbstractGPs.Xt_invA_X(cholesky(cov(q)), (mean(q) - mean(p)))) + p_μ, p_Σ = mean(p), cov(p) + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ q.Σ) + + Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) end -function expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) +# The closed form expected loglikelihood for a Gaussian likelihood +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector +) return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_quad_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + v, w = gausshermite(n_quad_points) + h = √2 * .√f_var' .* v .+ f_mean' + lls = loglikelihood.(lik.(h), y') + return ((1/√π) * w'lls)' +end + function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) kl_term = kl_divergence(q, fu) post = approx_posterior(SVGP(), fu, q) @@ -60,3 +83,13 @@ function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal return sum(variational_exp) * scale - kl_term end +function elbo(fx::LatentFiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + kl_term = kl_divergence(q, fu) + post = approx_posterior(SVGP(), fu, q) + f_mean, f_var = mean_and_var(post, fx.fx.x) + + variational_exp = expected_loglik(y, f_mean, f_var, fx.lik) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + From 59474c5e357a3700eb58757d2bc060043bcf5060 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 2 Jul 2021 01:01:39 +0100 Subject: [PATCH 09/66] Moved quadrature to new file. 
--- Project.toml | 1 + examples/classification.jl | 41 ++++++++++++++++++++++++++++++++++++++ src/SparseGPs.jl | 2 ++ src/quadrature.jl | 18 +++++++++++++++++ src/svgp.jl | 12 +++-------- 5 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 examples/classification.jl create mode 100644 src/quadrature.jl diff --git a/Project.toml b/Project.toml index f0fe1a01..45712703 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.1.0" [deps] AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" diff --git a/examples/classification.jl b/examples/classification.jl new file mode 100644 index 00000000..a522f016 --- /dev/null +++ b/examples/classification.jl @@ -0,0 +1,41 @@ +# Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html + +using SparseGPs +using AbstractGPs +using GPLikelihoods +using StatsFuns +using FastGaussQuadrature +using Distributions +using LinearAlgebra + +using Plots + +x = [5.668341708542713242, 5.758793969849246075, 5.517587939698492150, 2.954773869346733584, 3.648241206030150785, 2.110552763819095290, 4.613065326633165597, 4.793969849246231263, 4.703517587939698430, 6.030150753768843686, 3.015075376884421843, 3.979899497487437099, 3.226130653266331638, 1.899497487437185939, 1.145728643216080256, 3.316582914572864249, 6.030150753768843686, 2.231155778894472252, 3.256281407035175768, 1.085427135678391997, 1.809045226130653106, 4.492462311557789079, 1.959798994974874198, 0.000000000000000000, 3.346733668341708601, 1.507537688442210921, 1.809045226130653328, 5.517587939698492150, 2.201005025125628123, 5.577889447236180409, 1.809045226130653328, 1.688442211055276365, 4.160804020100502321, 2.170854271356783993, 4.311557788944723413, 3.075376884422110546, 5.125628140703517133, 1.989949748743718549, 5.366834170854271058, 4.100502512562814061, 7.236180904522613311, 2.261306532663316382, 3.467336683417085119, 1.085427135678391997, 5.095477386934673447, 5.185929648241205392, 2.743718592964823788, 2.773869346733668362, 1.417085427135678311, 1.989949748743718549] +y = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1] + +function make_kernel(k) + return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) +end + +k = [0.1, 0.1] + +kernel = make_kernel(k) +f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) +fx = f(x) +z = x[1:10] +fu = f(z).fx # want the underlying FiniteGP +q = MvNormal(zeros(length(z)), I) + +SparseGPs.kl_divergence(q, fu) +SparseGPs.elbo(fx, y, fu, q) + +post = SparseGPs.approx_posterior(SVGP(), fu, q) +f_mean, f_var = mean_and_var(post, fx.fx.x) + + +# v = inputs to evaluate +# w = weights +v, w = gausshermite(20); +h = √2 * .√f_var' .* v .+ f_mean' +lls = loglikelihood.(f.lik.(h), y') +var_exp = (1/√π) * sum(w'lls) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 309864b2..335af352 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -9,6 +9,7 @@ using Statistics using StatsBase using FastGaussQuadrature using GPLikelihoods +using ChainRulesCore using AbstractGPs: FiniteGP, @@ -24,6 +25,7 @@ export elbo, approx_posterior, SVGP +include("quadrature.jl") include("svgp.jl") end diff --git a/src/quadrature.jl b/src/quadrature.jl new file mode 100644 
index 00000000..7ba26e20 --- /dev/null +++ b/src/quadrature.jl @@ -0,0 +1,18 @@ + +function gauss_hermite_quadrature( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + xs, ws = gausshermite(n_points) + fs = √2 * .√f_var' .* xs .+ f_mean' + lls = loglikelihood.(lik.(fs), y') + return ((1/√π) * ws'lls)' +end + +ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/svgp.jl b/src/svgp.jl index 264b88dc..7cc31515 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -57,19 +57,13 @@ function expected_loglik( end function expected_loglik( - y::AbstractVector{<:Real}, + y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::BernoulliLikelihood; - n_quad_points=20 + n_points=20 ) - # Compute the expectation via Gauss-Hermite quadrature - # using a reparameterisation by change of variable - # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - v, w = gausshermite(n_quad_points) - h = √2 * .√f_var' .* v .+ f_mean' - lls = loglikelihood.(lik.(h), y') - return ((1/√π) * w'lls)' + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) end function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) From 25e662791c1ba444803b618a7afb06fd9124e301 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 3 Jul 2021 23:48:05 +0100 Subject: [PATCH 10/66] Fixed AD for quadrature. --- src/quadrature.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/quadrature.jl b/src/quadrature.jl index 7ba26e20..268e450d 100644 --- a/src/quadrature.jl +++ b/src/quadrature.jl @@ -1,18 +1,18 @@ - function gauss_hermite_quadrature( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::BernoulliLikelihood; + lik; n_points=20 ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(n_points) - fs = √2 * .√f_var' .* xs .+ f_mean' - lls = loglikelihood.(lik.(fs), y') - return ((1/√π) * ws'lls)' + # size(fs): (n_points, length(y)) + fs = √2 * .√f_var .* transpose(xs) .+ f_mean + lls = loglikelihood.(lik.(fs), y) + return (1/√π) * lls * ws end ChainRulesCore.@non_differentiable gausshermite(n) From 54b5470bc470f1422464b423887756e5ba656f7c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 3 Jul 2021 23:48:29 +0100 Subject: [PATCH 11/66] Fixed AD for KL divergence. --- src/svgp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 7cc31515..6962a148 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -42,7 +42,7 @@ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ q.Σ) + + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) end From 5e1c8829eee86dd6139ade8fc8c264952ff7511a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:15:07 +0100 Subject: [PATCH 12/66] Added classification example. 
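The example below exercises the full ELBO, whose other ingredient is the closed-form KL term KL(q(u) || p(u)) = ½ (logdet Σp - logdet Σq - d + tr(Σp⁻¹ Σq) + (μq - μp)ᵀ Σp⁻¹ (μq - μp)). A minimal sketch of that formula (restated locally as `kl_check` rather than calling the package, with arbitrary values), verified against the familiar one-dimensional expression:

    using Distributions, LinearAlgebra

    function kl_check(q::AbstractMvNormal, p::AbstractMvNormal)
        q_μ, q_Σ, p_μ, p_Σ = mean(q), cov(q), mean(p), cov(p)
        return 0.5 * (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) +
                      tr(p_Σ \ q_Σ) + (q_μ - p_μ)' * (p_Σ \ (q_μ - p_μ)))
    end

    μq, σq, μp, σp = 0.3, 0.8, -0.1, 1.2
    q = MvNormal([μq], fill(σq^2, 1, 1))
    p = MvNormal([μp], fill(σp^2, 1, 1))
    kl_check(q, p) ≈ log(σp / σq) + (σq^2 + (μq - μp)^2) / (2σp^2) - 0.5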
--- examples/classification.jl | 143 +++++++++++++++++++++++++++++++---- examples/data/classif_1D.csv | 50 ++++++++++++ src/quadrature.jl | 2 +- 3 files changed, 179 insertions(+), 16 deletions(-) create mode 100644 examples/data/classif_1D.csv diff --git a/examples/classification.jl b/examples/classification.jl index a522f016..21266bda 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -1,5 +1,6 @@ # Recreation of https://gpflow.readthedocs.io/en/master/notebooks/basics/classification.html +# %% using SparseGPs using AbstractGPs using GPLikelihoods @@ -7,35 +8,147 @@ using StatsFuns using FastGaussQuadrature using Distributions using LinearAlgebra +using DelimitedFiles +using IterTools using Plots -x = [5.668341708542713242, 5.758793969849246075, 5.517587939698492150, 2.954773869346733584, 3.648241206030150785, 2.110552763819095290, 4.613065326633165597, 4.793969849246231263, 4.703517587939698430, 6.030150753768843686, 3.015075376884421843, 3.979899497487437099, 3.226130653266331638, 1.899497487437185939, 1.145728643216080256, 3.316582914572864249, 6.030150753768843686, 2.231155778894472252, 3.256281407035175768, 1.085427135678391997, 1.809045226130653106, 4.492462311557789079, 1.959798994974874198, 0.000000000000000000, 3.346733668341708601, 1.507537688442210921, 1.809045226130653328, 5.517587939698492150, 2.201005025125628123, 5.577889447236180409, 1.809045226130653328, 1.688442211055276365, 4.160804020100502321, 2.170854271356783993, 4.311557788944723413, 3.075376884422110546, 5.125628140703517133, 1.989949748743718549, 5.366834170854271058, 4.100502512562814061, 7.236180904522613311, 2.261306532663316382, 3.467336683417085119, 1.085427135678391997, 5.095477386934673447, 5.185929648241205392, 2.743718592964823788, 2.773869346733668362, 1.417085427135678311, 1.989949748743718549] -y = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1] +# %% +# Read in the classification data +data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv" +x, y = eachcol(readdlm(data_file)) +scatter(x, y) + + +# %% +# First, create the GP kernel from given parameters k function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end -k = [0.1, 0.1] +k = [10, 0.1] kernel = make_kernel(k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) fx = f(x) -z = x[1:10] + + +# %% +# Then, plot some samples from the prior underlying GP +x_plot = 0:0.02:6 +prior_f_samples = rand(f.f(x_plot, 1e-6),20) + +plt = plot( + x_plot, + prior_f_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") + +# %% +# Plot the same samples, but pushed through a logistic sigmoid to constrain +# them in (0, 1). 
+prior_y_samples = mean.(f.lik.(prior_f_samples)) + +plt = plot( + x_plot, + prior_y_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") + + +# %% +using Flux + +struct SVGPLayer + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points +end + +@Flux.functor SVGPLayer (k, m, A,) # Don't train the inducing inputs + +lik = BernoulliLikelihood() +function (m::SVGPLayer)(x) + kernel = make_kernel(m.k) + f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) + q = MvNormal(m.m, m.A'm.A) + fx = f(x) + fu = f(m.z).fx + return fx, fu, q +end + +function flux_loss(x, y; n_data=1, n_batch=1) + fx, fu, q = model(x) + return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) +end + +# %% +M = 15 # number of inducing points + +# Initialise the parameters +k = [10, 0.1] +m = zeros(M) +A = Matrix{Float64}(I, M, M) +z = x[1:M] + +model = SVGPLayer(k, m, A, z) + +opt = ADAM(0.1) +parameters = Flux.params(model) + +# %% +# Negative ELBO before training +println(flux_loss(x, y)) + +# %% +# Train the model +Flux.train!( + (x, y) -> flux_loss(x, y), + parameters, + ncycle([(x, y)], 500), # Train for 1000 epochs + opt +) + +# %% +# Negative ELBO after training +println(flux_loss(x, y)) + +# %% +# After optimisation, plot samples from the underlying posterior GP. + fu = f(z).fx # want the underlying FiniteGP -q = MvNormal(zeros(length(z)), I) +post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) +l_post = LatentGP(post, BernoulliLikelihood(), 0.1) -SparseGPs.kl_divergence(q, fu) -SparseGPs.elbo(fx, y, fu, q) +post_f_samples = rand(l_post.f(x_plot, 1e-6),20) -post = SparseGPs.approx_posterior(SVGP(), fu, q) -f_mean, f_var = mean_and_var(post, fx.fx.x) +plt = plot( + x_plot, + post_f_samples; + seriescolor="red", + linealpha=0.2, + legend=false +) +# %% +# As above, push these samples through a logistic sigmoid to get posterior predictions. 
+post_y_samples = mean.(l_post.lik.(post_f_samples)) -# v = inputs to evaluate -# w = weights -v, w = gausshermite(20); -h = √2 * .√f_var' .* v .+ f_mean' -lls = loglikelihood.(f.lik.(h), y') -var_exp = (1/√π) * sum(w'lls) +plt = plot( + x_plot, + post_y_samples; + seriescolor="red", + linealpha=0.2, + # legend=false, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") +vline!(z; label="Pseudo-points") diff --git a/examples/data/classif_1D.csv b/examples/data/classif_1D.csv new file mode 100644 index 00000000..70ddb862 --- /dev/null +++ b/examples/data/classif_1D.csv @@ -0,0 +1,50 @@ +5.668341708542713242e+00 0.000000000000000000e+00 +5.758793969849246075e+00 0.000000000000000000e+00 +5.517587939698492150e+00 0.000000000000000000e+00 +2.954773869346733584e+00 1.000000000000000000e+00 +3.648241206030150785e+00 1.000000000000000000e+00 +2.110552763819095290e+00 1.000000000000000000e+00 +4.613065326633165597e+00 0.000000000000000000e+00 +4.793969849246231263e+00 0.000000000000000000e+00 +4.703517587939698430e+00 0.000000000000000000e+00 +6.030150753768843686e-01 1.000000000000000000e+00 +3.015075376884421843e-01 0.000000000000000000e+00 +3.979899497487437099e+00 0.000000000000000000e+00 +3.226130653266331638e+00 1.000000000000000000e+00 +1.899497487437185939e+00 1.000000000000000000e+00 +1.145728643216080256e+00 1.000000000000000000e+00 +3.316582914572864249e-01 0.000000000000000000e+00 +6.030150753768843686e-01 1.000000000000000000e+00 +2.231155778894472252e+00 1.000000000000000000e+00 +3.256281407035175768e+00 1.000000000000000000e+00 +1.085427135678391997e+00 1.000000000000000000e+00 +1.809045226130653106e+00 1.000000000000000000e+00 +4.492462311557789079e+00 0.000000000000000000e+00 +1.959798994974874198e+00 1.000000000000000000e+00 +0.000000000000000000e+00 0.000000000000000000e+00 +3.346733668341708601e+00 1.000000000000000000e+00 +1.507537688442210921e-01 0.000000000000000000e+00 +1.809045226130653328e-01 1.000000000000000000e+00 +5.517587939698492150e+00 0.000000000000000000e+00 +2.201005025125628123e+00 1.000000000000000000e+00 +5.577889447236180409e+00 0.000000000000000000e+00 +1.809045226130653328e-01 0.000000000000000000e+00 +1.688442211055276365e+00 1.000000000000000000e+00 +4.160804020100502321e+00 0.000000000000000000e+00 +2.170854271356783993e+00 1.000000000000000000e+00 +4.311557788944723413e+00 0.000000000000000000e+00 +3.075376884422110546e+00 1.000000000000000000e+00 +5.125628140703517133e+00 0.000000000000000000e+00 +1.989949748743718549e+00 1.000000000000000000e+00 +5.366834170854271058e+00 0.000000000000000000e+00 +4.100502512562814061e+00 0.000000000000000000e+00 +7.236180904522613311e-01 1.000000000000000000e+00 +2.261306532663316382e+00 1.000000000000000000e+00 +3.467336683417085119e+00 1.000000000000000000e+00 +1.085427135678391997e+00 1.000000000000000000e+00 +5.095477386934673447e+00 0.000000000000000000e+00 +5.185929648241205392e+00 0.000000000000000000e+00 +2.743718592964823788e+00 1.000000000000000000e+00 +2.773869346733668362e+00 1.000000000000000000e+00 +1.417085427135678311e+00 1.000000000000000000e+00 +1.989949748743718549e+00 1.000000000000000000e+00 diff --git a/src/quadrature.jl b/src/quadrature.jl index 268e450d..7a1de617 100644 --- a/src/quadrature.jl +++ b/src/quadrature.jl @@ -9,7 +9,7 @@ function gauss_hermite_quadrature( # using a reparameterisation by change of variable # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(n_points) - # size(fs): (n_points, length(y)) + # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) return (1/√π) * lls * ws From ce20ebac3395c595fe59599a54fbd840781ac851 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:43:58 +0100 Subject: [PATCH 13/66] Updated examples. --- examples/classification.jl | 4 ++ examples/{gpflow_svgp.jl => regression.jl} | 55 +++++++++++++--------- 2 files changed, 38 insertions(+), 21 deletions(-) rename examples/{gpflow_svgp.jl => regression.jl} (66%) diff --git a/examples/classification.jl b/examples/classification.jl index 21266bda..211c01af 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,10 @@ using DelimitedFiles using IterTools using Plots +default(; legend=:outertopright, size=(700, 400)) +using Random +Random.seed!(1234) # %% # Read in the classification data @@ -64,6 +67,7 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% +# A simple Flux model using Flux struct SVGPLayer diff --git a/examples/gpflow_svgp.jl b/examples/regression.jl similarity index 66% rename from examples/gpflow_svgp.jl rename to examples/regression.jl index 63f06668..59b10b35 100644 --- a/examples/gpflow_svgp.jl +++ b/examples/regression.jl @@ -1,4 +1,4 @@ -# An attempted recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html +# A recreation of https://gpflow.readthedocs.io/en/master/notebooks/advanced/gps_for_big_data.html using AbstractGPs using SparseGPs @@ -14,6 +14,7 @@ using Random Random.seed!(1234) # %% +# The data generating function function g(x) return sin(3π * x) + 0.3 * cos(9π * x) + 0.5 * sin(7π * x) end @@ -24,31 +25,28 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) -# %% -M = 50 # number of inducing points - -# TODO: incorporate better inducing point selection from -# https://github.com/JuliaGaussianProcesses/InducingPoints.jl? -z = x[1:M] # %% # A simple Flux model using Flux -struct SVGPLayer +struct SVGPModel k # kernel parameters m # variational mean A # variational covariance z # inducing points end -@Flux.functor SVGPLayer +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) end -function (m::SVGPLayer)(x) +# Create the 'model' from the parameters - i.e. return the FiniteGP at inputs x, +# the FiniteGP at inducing inputs z and the variational posterior over inducing +# points - q(u). +function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) @@ -57,7 +55,8 @@ function (m::SVGPLayer)(x) return fx, fu, q end -function posterior(m::SVGPLayer) +# Create the posterior GP from the model parameters. +function posterior(m::SVGPModel) kernel = make_kernel(m.k) f = GP(kernel) fu = f(m.z, 0.3) @@ -65,37 +64,50 @@ function posterior(m::SVGPLayer) return SparseGPs.approx_posterior(SVGP(), fu, q) end +# Return the loss given data - in this case the negative ELBO. 
function flux_loss(x, y; n_data=1, n_batch=1) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) end + +# %% +M = 50 # number of inducing points + +# Select the first M inputs as inducing inputs +z = x[1:M] + # Initialise the parameters k = [0.3, 10] m = zeros(M) A = Matrix{Float64}(I, M, M) -model = SVGPLayer(k, m, A, z) +model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.01) -# parameters = Flux.params(k, s, m, A) parameters = Flux.params(model) data_loader = Flux.Data.DataLoader((x, y), batchsize=b) # %% +# Negative ELBO before training println(flux_loss(x, y)) +# %% +# Train the model Flux.train!( (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), parameters, - ncycle(data_loader, 100), # Train for 100 epochs + ncycle(data_loader, 300), # Train for 400 epochs opt ) +# %% +# Negative ELBO after training println(flux_loss(x, y)) # %% +# Plot samples from the optmimised approximate posterior. post = posterior(model) scatter( @@ -113,9 +125,10 @@ plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") -# %% Find the exact posterior over u (e.g. -# https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ equations -# (11) & (12)) As a sanity check. +# %% There is a closed form optimal solution for the variational posterior q(u) +# (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ +# equations (11) & (12)). The SVGP posterior with this optimal q(u) should +# therefore be equivalent to the 'exact' sparse GP (Titsias) posterior. function exact_q(fu, fx, y) σ² = fx.Σy[1] @@ -123,8 +136,8 @@ function exact_q(fu, fx, y) Kuu = Symmetric(cov(fu)) Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) m = ((1/σ²)*Kuu* (Σ\Kuf)) * y - A = Symmetric(Kuu * (Σ \ Kuu)) - return MvNormal(m, A) + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) end kernel = make_kernel([0.2, 11]) @@ -136,8 +149,8 @@ q_ex = exact_q(fu, fx, y) scatter(x, y) scatter!(z, q_ex.μ) -# These two should be the same - and they are, the plot below shows almost identical predictions -ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman 2013 (exact) posterior +# These two should be the same - and they are, as the plot below shows +ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior # Should these be the same? (they currently aren't) From 359b3d54d621c5a0b6af21c2ee2071506e58e45c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 01:49:12 +0100 Subject: [PATCH 14/66] Renamed SVGPLayer to SVGPModel. 
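For the record, the `exact_q` helper above implements the optimal variational distribution q*(u) = N(m, S) with Σ = Kuu + σ⁻² Kuf Kfu, m = σ⁻² Kuu Σ⁻¹ Kuf y and S = Kuu Σ⁻¹ Kuu. When the inducing inputs coincide with the training inputs (so Kuu = Kuf = Kxx), this collapses to the exact GP posterior at the training points. A minimal numerical sketch of that reduction, with an arbitrary positive-definite matrix standing in for the kernel matrix:

    using LinearAlgebra

    n, σ² = 5, 0.3
    B = randn(n, n)
    K = Symmetric(B * B' + I)      # stand-in for Kuu = Kuf = Kxx
    y = randn(n)

    Σ = Symmetric(K + (1 / σ²) * K * K)
    m = ((1 / σ²) * K * (Σ \ K)) * y       # optimal variational mean
    S = Symmetric(K * (Σ \ K))             # optimal variational covariance

    m ≈ K * ((K + σ² * I) \ y)             # exact posterior mean at the training inputs
    S ≈ K - K * ((K + σ² * I) \ K)         # exact posterior covariance at the training inputs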
--- examples/classification.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 211c01af..85b875ca 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -70,17 +70,17 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # A simple Flux model using Flux -struct SVGPLayer +struct SVGPModel k # kernel parameters m # variational mean A # variational covariance z # inducing points end -@Flux.functor SVGPLayer (k, m, A,) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() -function (m::SVGPLayer)(x) +function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) q = MvNormal(m.m, m.A'm.A) @@ -103,7 +103,7 @@ m = zeros(M) A = Matrix{Float64}(I, M, M) z = x[1:M] -model = SVGPLayer(k, m, A, z) +model = SVGPModel(k, m, A, z) opt = ADAM(0.1) parameters = Flux.params(model) From 3bdbedb4a4fe8e05f79960c10618548f53f8371a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 13:39:00 +0100 Subject: [PATCH 15/66] Added basic test structure. --- test/Project.toml | 3 +++ test/runtests.jl | 12 ++++++++++++ test/svgp.jl | 4 ++++ 3 files changed, 19 insertions(+) create mode 100644 test/Project.toml create mode 100644 test/runtests.jl create mode 100644 test/svgp.jl diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 00000000..7a21f898 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,3 @@ +[deps] +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 00000000..914269c5 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,12 @@ +using Random +using Test +using SparseGPs + +const GROUP = get(ENV, "GROUP", "All") +const PKGDIR = dirname(dirname(pathof(SparseGPs))) + +@testset "SparseGPs" begin + include("svgp.jl") + println(" ") + @info "Ran svgp tests" +end diff --git a/test/svgp.jl b/test/svgp.jl new file mode 100644 index 00000000..b5e84b04 --- /dev/null +++ b/test/svgp.jl @@ -0,0 +1,4 @@ +@testset "svgp" begin + x = 4 + @test x == 4 +end From cb3a341d6f2bcb1b4474d7e3b2c8b8f48ef94681 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 14:57:54 +0100 Subject: [PATCH 16/66] Started equivalence tests --- test/equivalences.jl | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 test/equivalences.jl diff --git a/test/equivalences.jl b/test/equivalences.jl new file mode 100644 index 00000000..69e83a76 --- /dev/null +++ b/test/equivalences.jl @@ -0,0 +1,8 @@ +@testset "equivalences" begin + rng, N = MersenneTwister(654321), 20 + x = rand(rng, N) + y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) + + z = copy(x) # Set inducing inputs == training inputs + +end From 3a2c8a921fc2ab339a00196bcb23034419bc6537 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 4 Jul 2021 16:55:29 +0100 Subject: [PATCH 17/66] First pass (doesn't work yet) --- test/Project.toml | 2 + test/equivalences.jl | 88 +++++++++++++++++++++++++++++++++++++++++++- test/runtests.jl | 10 +++++ test/test_utils.jl | 0 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 test/test_utils.jl diff --git a/test/Project.toml b/test/Project.toml index 7a21f898..e089a59d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,3 +1,5 @@ [deps] +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +IterTools = 
"c8e1da08-722c-5040-9ed9-7db0dc04731e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/equivalences.jl b/test/equivalences.jl index 69e83a76..70e81aab 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,5 +4,91 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + + # Create a kernel from parameters k + kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + k_init = [0.1, 0.1] # initial kernel parameters + + lik_noise = 0.1 # The (fixed) Gaussian likelihood noise + + ## FIRST - define the models + # GPR - Exact GP regression + struct GPRModel + k # kernel parameters + end + @Flux.functor GPRModel + + function (m::GPRModel)(x) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + return fx + end + + # SGPR - Sparse GP regression (Titsias 2009) + struct SGPRModel + k # kernel parameters + z # inducing points + end + @Flux.functor SGPRModel (k,) # Don't train the inducing inputs + + function (m::SGPRModel)(x) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + fz = f(m.z, lik_noise) + return fx, fz + end + + # SVGP - Sparse variational GP regression (Hensman 2014) + struct SVGPModel + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) + end + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + + function (m::SVGPModel)(x) + f = GP(kernel(m.k)) + q = MvNormal(m.m, m.A'm.A) + fx = f(x, lik_noise) + fz = f(m.z, lik_noise) + return fx, fz, q + end + + ## SECOND - create the models and associated training losses + gpr = GPRModel(copy(k_init)) + function GPR_loss(x, y) + fx = gpr(x) + return -logpdf(fx, y) + end + + sgpr = SGPRModel(copy(k_init), copy(z)) + function SGPR_loss(x, y) + fx, fz = sgpr(x) + return -AbstractGPs.elbo(fx, y, fz) + end + + m, A = rand(rng, N), rand(rng, N, N) # initialise the variational parameters + svgp = SVGPModel(copy(k_init), copy(z), m, A) + function SVGP_loss(x, y) + fx, fz, q = svgp(x) + return -SparseGPs.elbo(fx, y, fz, q) + end + + ## THIRD - train the models + data = [(x, y)] + opt = ADAM(0.01) + + Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 300), opt) + Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 300), opt) + Flux.train!((x, y) -> SVGP_loss(x, y), Flux.params(svgp), ncycle(data, 300), opt) + + ## FOURTH - test equivalence + println(gpr.k) + println(sgpr.k) + println(svgp.k) + @test gpr.k ≈ svgp.k + + # TODO: test posterior predictions end + diff --git a/test/runtests.jl b/test/runtests.jl index 914269c5..c2146c68 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,12 +1,22 @@ using Random using Test using SparseGPs +using Flux +using IterTools +using AbstractGPs +using SparseGPs const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) +include("test_utils.jl") + @testset "SparseGPs" begin include("svgp.jl") println(" ") @info "Ran svgp tests" + + include("equivalences.jl") + println(" ") + @info "Ran equivalences tests" end diff --git a/test/test_utils.jl b/test/test_utils.jl new file mode 100644 index 00000000..e69de29b From 005f8f03c9d896774260b760df33d77cad00d42d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 02:43:17 +0100 Subject: [PATCH 18/66] Working tests --- test/Project.toml | 2 ++ test/equivalences.jl | 85 ++++++++++++++++++++++++++++---------------- test/runtests.jl | 2 +- 3 files changed, 58 
insertions(+), 31 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index e089a59d..a4a781f3 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,6 @@ [deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/equivalences.jl b/test/equivalences.jl index 70e81aab..46fb2ba4 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -1,15 +1,16 @@ @testset "equivalences" begin rng, N = MersenneTwister(654321), 20 - x = rand(rng, N) + x = rand(rng, N) * 10 y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs # Create a kernel from parameters k kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - k_init = [0.1, 0.1] # initial kernel parameters + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise + jitter = 1e-5 ## FIRST - define the models # GPR - Exact GP regression @@ -24,19 +25,19 @@ return fx end - # SGPR - Sparse GP regression (Titsias 2009) - struct SGPRModel - k # kernel parameters - z # inducing points - end - @Flux.functor SGPRModel (k,) # Don't train the inducing inputs + # # SGPR - Sparse GP regression (Titsias 2009) + # struct SGPRModel + # k # kernel parameters + # z # inducing points + # end + # @Flux.functor SGPRModel (k,) # Don't train the inducing inputs - function (m::SGPRModel)(x) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - fz = f(m.z, lik_noise) - return fx, fz - end + # function (m::SGPRModel)(x) + # f = GP(kernel(m.k)) + # fx = f(x, lik_noise) + # fz = f(m.z, lik_noise) + # return fx, fz + # end # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel @@ -51,7 +52,7 @@ f = GP(kernel(m.k)) q = MvNormal(m.m, m.A'm.A) fx = f(x, lik_noise) - fz = f(m.z, lik_noise) + fz = f(m.z, jitter) return fx, fz, q end @@ -62,13 +63,13 @@ return -logpdf(fx, y) end - sgpr = SGPRModel(copy(k_init), copy(z)) - function SGPR_loss(x, y) - fx, fz = sgpr(x) - return -AbstractGPs.elbo(fx, y, fz) - end + # sgpr = SGPRModel(copy(k_init), copy(z)) + # function SGPR_loss(x, y) + # fx, fz = sgpr(x) + # return -AbstractGPs.elbo(fx, y, fz) + # end - m, A = rand(rng, N), rand(rng, N, N) # initialise the variational parameters + m, A = rand(rng, N), rand(rng, N, N)/2 # initialise the variational parameters svgp = SVGPModel(copy(k_init), copy(z), m, A) function SVGP_loss(x, y) fx, fz, q = svgp(x) @@ -79,16 +80,40 @@ data = [(x, y)] opt = ADAM(0.01) - Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 300), opt) - Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 300), opt) - Flux.train!((x, y) -> SVGP_loss(x, y), Flux.params(svgp), ncycle(data, 300), opt) + svgp_ps = Flux.params(svgp) + delete!(svgp_ps, svgp.k) # Don't train the kernel parameters + + # Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 3000), opt) + # Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 3000), opt) + Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 9000), opt) + + ## FOURTH - construct the posteriors + function posterior(m::GPRModel, x, y) + f = GP(kernel(m.k)) + fx = f(x, lik_noise) + return AbstractGPs.posterior(fx, y) + end + + # function posterior(m::SGPRModel, x, y) + # f = GP(kernel(m.k)) + # fx = f(x, lik_noise) + # fz = 
f(m.z) + # return AbstractGPs.approx_posterior(VFE(), fx, y, fz) + # end + + function posterior(m::SVGPModel) + f = GP(kernel(m.k)) + fz = f(m.z, jitter) + q = MvNormal(m.m, m.A'm.A) + return SparseGPs.approx_posterior(SVGP(), fz, q) + end + gpr_post = posterior(gpr, x, y) + # sgpr_post = posterior(sgpr, x, y) + svgp_post = posterior(svgp) - ## FOURTH - test equivalence - println(gpr.k) - println(sgpr.k) - println(svgp.k) - @test gpr.k ≈ svgp.k + ## FIFTH - test equivalences + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-3)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-3)) - # TODO: test posterior predictions end diff --git a/test/runtests.jl b/test/runtests.jl index c2146c68..e5d8346a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using SparseGPs using Flux using IterTools using AbstractGPs -using SparseGPs +using Distributions const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) From 443a2d4f1c5675b9bf21b9cedbe98a1483f53e92 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 02:47:52 +0100 Subject: [PATCH 19/66] Fixed KL divergence --- examples/regression.jl | 2 +- src/svgp.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/regression.jl b/examples/regression.jl index 59b10b35..6f5b0766 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -98,7 +98,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), parameters, - ncycle(data_loader, 300), # Train for 400 epochs + ncycle(data_loader, 300), # Train for 300 epochs opt ) diff --git a/src/svgp.jl b/src/svgp.jl index 6962a148..0245ed71 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -43,7 +43,7 @@ end function kl_divergence(q::MvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(q.Σ), (q.μ - p_μ))) + Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) end # The closed form expected loglikelihood for a Gaussian likelihood From 92da73c4871becf5418257be2c11ce28e691a181 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 6 Jul 2021 18:06:41 +0100 Subject: [PATCH 20/66] Refactored elbo stuff --- src/SparseGPs.jl | 2 +- src/elbo.jl | 100 ++++++++++++++++++++++++++++++++++++++++++++++ src/quadrature.jl | 18 --------- src/svgp.jl | 74 ++++++++++------------------------ 4 files changed, 123 insertions(+), 71 deletions(-) create mode 100644 src/elbo.jl delete mode 100644 src/quadrature.jl diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 335af352..e34e34c1 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -25,7 +25,7 @@ export elbo, approx_posterior, SVGP -include("quadrature.jl") +include("elbo.jl") include("svgp.jl") end diff --git a/src/elbo.jl b/src/elbo.jl new file mode 100644 index 00000000..0df7f051 --- /dev/null +++ b/src/elbo.jl @@ -0,0 +1,100 @@ +""" + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + +Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are +observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a +variational distribution over inducing points `u = f(z)`. + +[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable +variational Gaussian process classification." Artificial Intelligence and +Statistics. PMLR, 2015. 
+""" + +function elbo( + fx::FiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal; + n_data=1, + n_batch=1 +) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + + Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise + variational_exp = expected_loglik(y, f_mean, f_var, Σy) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + +function elbo( + lfx::LatentFiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal; + n_data=1, + n_batch=1 +) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + + variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end + +# Computes the common intermediates needed for the ELBO +function _elbo_intermediates( + fx::FiniteGP, + y::AbstractVector{<:Real}, + fz::FiniteGP, + q::MvNormal +) + kl_term = kl_divergence(q, fz) + post = approx_posterior(SVGP(), fz, q) + f_mean, f_var = mean_and_var(post, fx.fx.x) + return kl_term, f_mean, f_var +end + +# The closed form expected loglikelihood for a Gaussian likelihood +function expected_loglik( + y::AbstractVector{<:Real}, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector +) + return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) +end + +function expected_loglik( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::BernoulliLikelihood; + n_points=20 +) + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) +end + +function kl_divergence(q::MvNormal, p::AbstractMvNormal) + p_μ, p_Σ = mean(p), cov(p) + (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + + Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) +end + +function gauss_hermite_quadrature( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik; + n_points=20 +) + # Compute the expectation via Gauss-Hermite quadrature + # using a reparameterisation by change of variable + # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) + xs, ws = gausshermite(n_points) + # size(fs): (length(y), n_points) + fs = √2 * .√f_var .* transpose(xs) .+ f_mean + lls = loglikelihood.(lik.(fs), y) + return (1/√π) * lls * ws +end + +ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/quadrature.jl b/src/quadrature.jl deleted file mode 100644 index 7a1de617..00000000 --- a/src/quadrature.jl +++ /dev/null @@ -1,18 +0,0 @@ -function gauss_hermite_quadrature( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik; - n_points=20 -) - # Compute the expectation via Gauss-Hermite quadrature - # using a reparameterisation by change of variable - # (see eg. en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - xs, ws = gausshermite(n_points) - # size(fs): (length(y), n_points) - fs = √2 * .√f_var .* transpose(xs) .+ f_mean - lls = loglikelihood.(lik.(fs), y) - return (1/√π) * lls * ws -end - -ChainRulesCore.@non_differentiable gausshermite(n) diff --git a/src/svgp.jl b/src/svgp.jl index 0245ed71..229d2173 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,11 +1,29 @@ struct SVGP end -function approx_posterior(::SVGP, fu::FiniteGP, q::MvNormal) +""" + approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) + +Compute the approximate posterior [1] over the process `f = fz.f`, given inducing +inputs `z = fz.x` and a variational distribution over inducing points `q(u)` where `u = +f(z)`. 
The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` +is then given by: + +```math +q(f^*) = \int p(f | u) q(u) du +``` +which can be found in closed form. + +[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable +variational Gaussian process classification." Artificial Intelligence and +Statistics. PMLR, 2015. +""" + +function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) m, A = mean(q), cholesky(cov(q)) - Kuu = cholesky(Symmetric(cov(fu))) + Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L - data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fu.x) - return ApproxPosteriorGP(SVGP(), fu.f, data) + data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) + return ApproxPosteriorGP(SVGP(), fz.f, data) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) @@ -39,51 +57,3 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end - -function kl_divergence(q::MvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) -end - -# The closed form expected loglikelihood for a Gaussian likelihood -function expected_loglik( - y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractVector -) - return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) -end - -function expected_loglik( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik::BernoulliLikelihood; - n_points=20 -) - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) -end - -function elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) - kl_term = kl_divergence(q, fu) - post = approx_posterior(SVGP(), fu, q) - f_mean, f_var = mean_and_var(post, fx.x) - Σy = diag(fx.Σy) - - variational_exp = expected_loglik(y, f_mean, f_var, Σy) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term -end - -function elbo(fx::LatentFiniteGP, y::AbstractVector{<:Real}, fu::FiniteGP, q::MvNormal; n_data=1, n_batch=1) - kl_term = kl_divergence(q, fu) - post = approx_posterior(SVGP(), fu, q) - f_mean, f_var = mean_and_var(post, fx.fx.x) - - variational_exp = expected_loglik(y, f_mean, f_var, fx.lik) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term -end - From 7d05d1b92969d79ca5e4a7b093cab1f67ad3611e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 12:46:50 +0100 Subject: [PATCH 21/66] Fixed elbo mistakes --- examples/classification.jl | 4 ++-- src/elbo.jl | 7 +++---- src/svgp.jl | 18 ++++++++++-------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 85b875ca..6dfd0c00 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -117,7 +117,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y), parameters, - ncycle([(x, y)], 500), # Train for 1000 epochs + ncycle([(x, y)], 1000), # Train for 1000 epochs opt ) @@ -132,7 +132,7 @@ fu = f(z).fx # want the underlying FiniteGP post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) l_post = LatentGP(post, BernoulliLikelihood(), 0.1) -post_f_samples = rand(l_post.f(x_plot, 1e-6),20) +post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) plt = plot( x_plot, diff --git a/src/elbo.jl b/src/elbo.jl index 0df7f051..827af5f3 100644 --- 
a/src/elbo.jl +++ b/src/elbo.jl @@ -18,7 +18,7 @@ function elbo( n_data=1, n_batch=1 ) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise variational_exp = expected_loglik(y, f_mean, f_var, Σy) @@ -34,7 +34,7 @@ function elbo( n_data=1, n_batch=1 ) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx) + kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) scale = n_data / n_batch @@ -44,13 +44,12 @@ end # Computes the common intermediates needed for the ELBO function _elbo_intermediates( fx::FiniteGP, - y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal ) kl_term = kl_divergence(q, fz) post = approx_posterior(SVGP(), fz, q) - f_mean, f_var = mean_and_var(post, fx.fx.x) + f_mean, f_var = mean_and_var(post, fx.x) return kl_term, f_mean, f_var end diff --git a/src/svgp.jl b/src/svgp.jl index 229d2173..15b0a94d 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -9,7 +9,7 @@ f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` is then given by: ```math -q(f^*) = \int p(f | u) q(u) du +q(f^*) = \\int p(f | u) q(u) du ``` which can be found in closed form. @@ -19,19 +19,13 @@ Statistics. PMLR, 2015. """ function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) - m, A = mean(q), cholesky(cov(q)) + m, A = q.μ, cholesky(q.Σ) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end -function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - Cux = cov(f.prior, f.data.u, x) - D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) -end - function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return cov(f.prior, x, f.data.u) * f.data.α end @@ -42,6 +36,14 @@ function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end +function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) + Cux = cov(f.prior, f.data.u, x) + D = f.data.Kuu.L \ Cux + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) +end + +#TODO: cov(x, y) + function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux From c0dd7372e70331b63ff00fffae831d2c18bcc74d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 14:01:17 +0100 Subject: [PATCH 22/66] Remove type restiction in ELBO --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 827af5f3..425cf6bf 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -28,7 +28,7 @@ end function elbo( lfx::LatentFiniteGP, - y::AbstractVector{<:Real}, + y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=1, From 92dcdf5a9bf0f4003767a5ef9196e36c2a0b4973 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 7 Jul 2021 16:30:49 +0100 Subject: [PATCH 23/66] Infer batch size --- examples/classification.jl | 4 ++-- examples/regression.jl | 6 +++--- src/elbo.jl | 11 +++++------ 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 6dfd0c00..03146024 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -89,9 +89,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data=1, n_batch=1) 
+function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) + return -SparseGPs.elbo(fx, y, fu, q; n_data) end # %% diff --git a/examples/regression.jl b/examples/regression.jl index 6f5b0766..0cba8ac6 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -65,9 +65,9 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. -function flux_loss(x, y; n_data=1, n_batch=1) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, n_batch) + return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -96,7 +96,7 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data=N, n_batch=b), + (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs opt diff --git a/src/elbo.jl b/src/elbo.jl index 425cf6bf..20e37b96 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,5 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1, n_batch=1) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a @@ -9,15 +9,14 @@ variational distribution over inducing points `u = f(z)`. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ - function elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; - n_data=1, - n_batch=1 + n_data=length(y) ) + n_batch = length(y) kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise @@ -31,9 +30,9 @@ function elbo( y::AbstractVector, fz::FiniteGP, q::MvNormal; - n_data=1, - n_batch=1 + n_data=length(y) ) + n_batch = length(y) kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) From ec5fa05e546fe7966a88f46fe2b14b695352c27c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 18:25:00 +0100 Subject: [PATCH 24/66] Added docstrings to elbo.jl --- src/elbo.jl | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 20e37b96..4f6783bd 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,5 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=1) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=length(y)) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a @@ -25,6 +25,12 @@ function elbo( return sum(variational_exp) * scale - kl_term end + +""" + elbo(fx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=length(y)) + +Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. +""" function elbo( lfx::LatentFiniteGP, y::AbstractVector, @@ -52,6 +58,42 @@ function _elbo_intermediates( return kl_term, f_mean, f_var end +"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." 
+ScalarLikelihood = Union{BernoulliLikelihood, CategoricalLikelihood, PoissonLikelihood} + +""" + expected_loglik(y, f_mean, f_var, [Σy | lik]) + +This function computes the expected log likelihood: + +```math + ∫ q(f) log p(y | f) df +``` +where `p(y | f)` is the process likelihood. + +`q(f)` is an approximation to the latent function values `f` given by: +```math + q(f) = ∫ p(f | u) q(u) du +``` +where `q(u)` is the variational distribution over inducing points (see +[`elbo`](@ref)). + +Where possible, this expectation is calculated in closed form. Otherwise, it is +approximated using Gauss-Hermite quadrature by default. + +# Extended help + +`q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to +have independent marginals such that only the marginals of `q(f)` are required. +""" + +function expected_loglik end + +""" + expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + +The expected log likelihood for a Gaussian likelihood, computed in closed form. +""" # The closed form expected loglikelihood for a Gaussian likelihood function expected_loglik( y::AbstractVector{<:Real}, @@ -62,11 +104,18 @@ function expected_loglik( return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) end +""" + expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; n_points=20) + +The expected log likelihood for a `ScalarLikelihood`, approximated via +Gauss-Hermite quadrature with `n_points` quadrature points. +""" + function expected_loglik( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::BernoulliLikelihood; + lik::ScalarLikelihood; n_points=20 ) return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) From 6d4e87b87f5329eb00750c0c4c5c7cadf2c97328 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 20:56:06 +0100 Subject: [PATCH 25/66] Added cross-covariance --- src/svgp.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 15b0a94d..d4f0e944 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -42,7 +42,14 @@ function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end -#TODO: cov(x, y) +function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) + B = f.data.B + Cxu = cov(f.prior, x, f.data.u) + Cuy = cov(f.prior, f.data.u, y) + D = f.data.Kuu.L \ Cuy + E = Cxu / f.data.Kuu.L' + return cov(f.prior, x, y) - (E * D) + (E * B * B' * D) +end function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) From 22c999ae39f11989fd556be47b5254cee0af8487 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 21:46:44 +0100 Subject: [PATCH 26/66] Removed unnecessary dependencies --- Project.toml | 2 -- src/SparseGPs.jl | 2 -- 2 files changed, 4 deletions(-) diff --git a/Project.toml b/Project.toml index 45712703..e2645e29 100644 --- a/Project.toml +++ b/Project.toml @@ -10,7 +10,5 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Optim = "429524aa-4258-5aef-a3af-852621145aeb" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -StatsFuns = 
"4c63d2b9-4356-54db-8cca-17b64c39e42c" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index e34e34c1..4bd6092b 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -2,8 +2,6 @@ module SparseGPs using AbstractGPs using Distributions -using Optim -using StatsFuns using LinearAlgebra using Statistics using StatsBase From 27639723ce0d4d4c7d9f85e6ec453e387f9a53bb Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 13 Jul 2021 22:07:50 +0100 Subject: [PATCH 27/66] Updated regression example --- examples/regression.jl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/regression.jl b/examples/regression.jl index 0cba8ac6..d537f448 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -30,6 +30,9 @@ scatter(x, y; xlabel="x", ylabel="y", legend=false) # A simple Flux model using Flux +lik_noise = 0.3 +jitter = 1e-5 + struct SVGPModel k # kernel parameters m # variational mean @@ -50,8 +53,8 @@ function (m::SVGPModel)(x) kernel = make_kernel(m.k) f = GP(kernel) q = MvNormal(m.m, m.A'm.A) - fx = f(x, 0.3) - fu = f(m.z, 0.3) + fx = f(x, lik_noise) + fu = f(m.z, jitter) return fx, fu, q end @@ -59,7 +62,7 @@ end function posterior(m::SVGPModel) kernel = make_kernel(m.k) f = GP(kernel) - fu = f(m.z, 0.3) + fu = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) return SparseGPs.approx_posterior(SVGP(), fu, q) end @@ -85,7 +88,7 @@ A = Matrix{Float64}(I, M, M) model = SVGPModel(k, m, A, z) b = 100 # minibatch size -opt = ADAM(0.01) +opt = ADAM(0.001) parameters = Flux.params(model) data_loader = Flux.Data.DataLoader((x, y), batchsize=b) @@ -140,10 +143,10 @@ function exact_q(fu, fx, y) return MvNormal(m, S) end -kernel = make_kernel([0.2, 11]) +kernel = make_kernel([0.3, 10]) f = GP(kernel) -fx = f(x, 0.1) -fu = f(z, 0.1) +fx = f(x, lik_noise) +fu = f(z, jitter) q_ex = exact_q(fu, fx, y) scatter(x, y) @@ -153,7 +156,7 @@ scatter!(z, q_ex.μ) ap_ex = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior ap_tits = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior -# Should these be the same? 
(they currently aren't) +# These are also approximately equal SparseGPs.elbo(fx, y, fu, q_ex) AbstractGPs.elbo(fx, y, fu) @@ -161,7 +164,9 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - xlim=(0, 1), + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), xlabel="x", ylabel="y", title="posterior (VI with sparse grid)", From 23e5c2e3c6ba7174d679002d0a482853d2227778 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 14 Jul 2021 11:56:42 +0100 Subject: [PATCH 28/66] Added exact posterior tests --- test/Project.toml | 1 + test/equivalences.jl | 206 ++++++++++++++++++++++--------------------- test/runtests.jl | 1 + 3 files changed, 109 insertions(+), 99 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index a4a781f3..47a7de77 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,5 +3,6 @@ AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/equivalences.jl b/test/equivalences.jl index 46fb2ba4..e3f70fab 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,116 +4,124 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs + + make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - # Create a kernel from parameters k - kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) k_init = [0.2, 0.6] # initial kernel parameters - lik_noise = 0.1 # The (fixed) Gaussian likelihood noise - jitter = 1e-5 - - ## FIRST - define the models - # GPR - Exact GP regression - struct GPRModel - k # kernel parameters - end - @Flux.functor GPRModel - - function (m::GPRModel)(x) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - return fx - end - - # # SGPR - Sparse GP regression (Titsias 2009) - # struct SGPRModel - # k # kernel parameters - # z # inducing points - # end - # @Flux.functor SGPRModel (k,) # Don't train the inducing inputs - - # function (m::SGPRModel)(x) - # f = GP(kernel(m.k)) - # fx = f(x, lik_noise) - # fz = f(m.z, lik_noise) - # return fx, fz - # end - - # SVGP - Sparse variational GP regression (Hensman 2014) - struct SVGPModel - k # kernel parameters - z # inducing points - m # variational mean - A # variational covariance sqrt (Σ = A'A) - end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs - function (m::SVGPModel)(x) - f = GP(kernel(m.k)) - q = MvNormal(m.m, m.A'm.A) + @testset "exact posterior" begin + # There is a closed form optimal solution for the variational posterior + # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ + # equations (11) & (12)). The SVGP posterior with this optimal q(u) + # should therefore be equivalent to the sparse GP (Titsias) posterior + # and exact GP regression (when z == x). 
+ + function exact_q(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) + end + + kernel = make_kernel(k_init) + f = GP(kernel) fx = f(x, lik_noise) - fz = f(m.z, jitter) - return fx, fz, q - end - - ## SECOND - create the models and associated training losses - gpr = GPRModel(copy(k_init)) - function GPR_loss(x, y) - fx = gpr(x) - return -logpdf(fx, y) - end - - # sgpr = SGPRModel(copy(k_init), copy(z)) - # function SGPR_loss(x, y) - # fx, fz = sgpr(x) - # return -AbstractGPs.elbo(fx, y, fz) - # end - - m, A = rand(rng, N), rand(rng, N, N)/2 # initialise the variational parameters - svgp = SVGPModel(copy(k_init), copy(z), m, A) - function SVGP_loss(x, y) - fx, fz, q = svgp(x) - return -SparseGPs.elbo(fx, y, fz, q) - end - - ## THIRD - train the models - data = [(x, y)] - opt = ADAM(0.01) + fu = f(z) + q_ex = exact_q(fu, fx, y) - svgp_ps = Flux.params(svgp) - delete!(svgp_ps, svgp.k) # Don't train the kernel parameters + gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression + vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior + svgp_post = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior - # Flux.train!((x, y) -> GPR_loss(x, y), Flux.params(gpr), ncycle(data, 3000), opt) - # Flux.train!((x, y) -> SGPR_loss(x, y), Flux.params(sgpr), ncycle(data, 3000), opt) - Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 9000), opt) + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 - ## FOURTH - construct the posteriors - function posterior(m::GPRModel, x, y) - f = GP(kernel(m.k)) - fx = f(x, lik_noise) - return AbstractGPs.posterior(fx, y) + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 end - # function posterior(m::SGPRModel, x, y) - # f = GP(kernel(m.k)) - # fx = f(x, lik_noise) - # fz = f(m.z) - # return AbstractGPs.approx_posterior(VFE(), fx, y, fz) - # end - - function posterior(m::SVGPModel) - f = GP(kernel(m.k)) - fz = f(m.z, jitter) - q = MvNormal(m.m, m.A'm.A) - return SparseGPs.approx_posterior(SVGP(), fz, q) + @testset "optimised posterior" begin + jitter = 1e-5 + + ## FIRST - define the models + # GPR - Exact GP regression + struct GPRModel + k # kernel parameters + end + @Flux.functor GPRModel + + function (m::GPRModel)(x) + f = GP(make_kernel(m.k)) + fx = f(x, lik_noise) + return fx + end + + # SVGP - Sparse variational GP regression (Hensman 2014) + struct SVGPModel + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) + end + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + + function (m::SVGPModel)(x) + f = GP(make_kernel(m.k)) + q = MvNormal(m.m, m.A'm.A) + fx = f(x, lik_noise) + fz = f(m.z, jitter) + return fx, fz, q + end + + ## SECOND - create the models and associated training losses + gpr = GPRModel(copy(k_init)) + function GPR_loss(x, y) + fx = gpr(x) + return -logpdf(fx, y) + end + + m, A = zeros(N), Matrix{Float64}(I, N, N) # initialise the variational parameters + svgp = SVGPModel(copy(k_init), copy(z), m, A) + function SVGP_loss(x, y) + fx, fz, q = svgp(x) + return -SparseGPs.elbo(fx, y, fz, q) + end + + ## THIRD - train the models + data = [(x, y)] + opt = ADAM(0.001) + + svgp_ps = Flux.params(svgp) + delete!(svgp_ps, svgp.k) # Don't 
train the kernel parameters + + # Optimise q(u) + Flux.train!((x, y) -> SVGP_loss(x, y), svgp_ps, ncycle(data, 20000), opt) + + ## FOURTH - construct the posteriors + function posterior(m::GPRModel, x, y) + f = GP(make_kernel(m.k)) + fx = f(x, lik_noise) + return AbstractGPs.posterior(fx, y) + end + + function posterior(m::SVGPModel) + f = GP(make_kernel(m.k)) + fz = f(m.z, jitter) + q = MvNormal(m.m, m.A'm.A) + return SparseGPs.approx_posterior(SVGP(), fz, q) + end + + gpr_post = posterior(gpr, x, y) + svgp_post = posterior(svgp) + + ## FIFTH - test equivalences + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end - gpr_post = posterior(gpr, x, y) - # sgpr_post = posterior(sgpr, x, y) - svgp_post = posterior(svgp) - - ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-3)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-3)) end diff --git a/test/runtests.jl b/test/runtests.jl index e5d8346a..5419760a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ using Flux using IterTools using AbstractGPs using Distributions +using LinearAlgebra const GROUP = get(ENV, "GROUP", "All") const PKGDIR = dirname(dirname(pathof(SparseGPs))) From a8e5cbe740e26e0090021cc0cc05cf67e07435e0 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 14 Jul 2021 22:39:20 +0100 Subject: [PATCH 29/66] Address review comments --- src/elbo.jl | 21 +++++++++++---------- src/svgp.jl | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 4f6783bd..ad2e2f54 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -13,7 +13,7 @@ function elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, - q::MvNormal; + q::AbstractMvNormal; n_data=length(y) ) n_batch = length(y) @@ -35,7 +35,7 @@ function elbo( lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, - q::MvNormal; + q::AbstractMvNormal; n_data=length(y) ) n_batch = length(y) @@ -43,14 +43,14 @@ function elbo( variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term + return variational_exp * scale - kl_term end # Computes the common intermediates needed for the ELBO function _elbo_intermediates( fx::FiniteGP, fz::FiniteGP, - q::MvNormal + q::AbstractMvNormal ) kl_term = kl_divergence(q, fz) post = approx_posterior(SVGP(), fz, q) @@ -59,7 +59,7 @@ function _elbo_intermediates( end "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." 
-ScalarLikelihood = Union{BernoulliLikelihood, CategoricalLikelihood, PoissonLikelihood} +ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood} """ expected_loglik(y, f_mean, f_var, [Σy | lik]) @@ -101,7 +101,7 @@ function expected_loglik( f_var::AbstractVector, Σy::AbstractVector ) - return -0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy) + return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) end """ @@ -118,13 +118,14 @@ function expected_loglik( lik::ScalarLikelihood; n_points=20 ) - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points) + return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) end -function kl_divergence(q::MvNormal, p::AbstractMvNormal) +function kl_divergence(q::AbstractMvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) - (1/2) .* (logdet(p_Σ) - logdet(q.Σ) - length(p_μ) + tr(p_Σ \ cov(q)) + - Xt_invA_X(cholesky(p_Σ), (q.μ - p_μ))) + q_μ, q_Σ = mean(q), cov(q) + (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + + Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end function gauss_hermite_quadrature( diff --git a/src/svgp.jl b/src/svgp.jl index d4f0e944..f0724905 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -18,8 +18,8 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ -function approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) - m, A = q.μ, cholesky(q.Σ) +function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) + m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) From 1bbeae0e18f5f18de08d8e5a2a07afc69040a744 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 16 Jul 2021 16:29:28 +0100 Subject: [PATCH 30/66] Fix docstrings Co-authored-by: st-- --- src/elbo.jl | 9 +++------ src/svgp.jl | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index ad2e2f54..c12afccc 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,8 +1,8 @@ """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::MvNormal; n_data=length(y)) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are -observations of `fx`, pseudo-inputs are given by `z = fz.z` and `q(u)` is a +observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable @@ -27,7 +27,7 @@ end """ - elbo(fx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::MvNormal; n_data=length(y)) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -86,7 +86,6 @@ approximated using Gauss-Hermite quadrature by default. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. """ - function expected_loglik end """ @@ -94,7 +93,6 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form. 
""" -# The closed form expected loglikelihood for a Gaussian likelihood function expected_loglik( y::AbstractVector{<:Real}, f_mean::AbstractVector, @@ -110,7 +108,6 @@ end The expected log likelihood for a `ScalarLikelihood`, approximated via Gauss-Hermite quadrature with `n_points` quadrature points. """ - function expected_loglik( y::AbstractVector, f_mean::AbstractVector, diff --git a/src/svgp.jl b/src/svgp.jl index f0724905..e87da8bc 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -1,6 +1,6 @@ struct SVGP end -""" +raw""" approx_posterior(::SVGP, fz::FiniteGP, q::MvNormal) Compute the approximate posterior [1] over the process `f = fz.f`, given inducing @@ -9,7 +9,7 @@ f(z)`. The approximate posterior at test points ``x^*`` where ``f^* = f(x^*)`` is then given by: ```math -q(f^*) = \\int p(f | u) q(u) du +q(f^*) = \int p(f | u) q(u) du ``` which can be found in closed form. From 1a0782ffbb0ead758ba80b7892fcd0094306c576 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sun, 18 Jul 2021 20:07:59 +0100 Subject: [PATCH 31/66] Rename kldivergence --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index c12afccc..5894faeb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -52,7 +52,7 @@ function _elbo_intermediates( fz::FiniteGP, q::AbstractMvNormal ) - kl_term = kl_divergence(q, fz) + kl_term = StatsBase.kldivergence(q, fz) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) return kl_term, f_mean, f_var @@ -118,7 +118,7 @@ function expected_loglik( return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) end -function kl_divergence(q::AbstractMvNormal, p::AbstractMvNormal) +function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) p_μ, p_Σ = mean(p), cov(p) q_μ, q_Σ = mean(q), cov(q) (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + From eddc7ab8ebaa675efb463d601ca21696c31018bb Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 13:18:58 +0100 Subject: [PATCH 32/66] Factor out exact posterior --- test/equivalences.jl | 12 +----------- test/test_utils.jl | 12 ++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/test/equivalences.jl b/test/equivalences.jl index e3f70fab..1765c790 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -17,21 +17,11 @@ # should therefore be equivalent to the sparse GP (Titsias) posterior # and exact GP regression (when z == x). - function exact_q(fu, fx, y) - σ² = fx.Σy[1] - Kuf = cov(fu, fx) - Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y - S = Symmetric(Kuu * (Σ \ Kuu)) - return MvNormal(m, S) - end - kernel = make_kernel(k_init) f = GP(kernel) fx = f(x, lik_noise) fu = f(z) - q_ex = exact_q(fu, fx, y) + q_ex = exact_variational_posterior(fu, fx, y) gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior diff --git a/test/test_utils.jl b/test/test_utils.jl index e69de29b..0bae973c 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -0,0 +1,12 @@ +# Computes the optimal closed form solution for the variational posterior +# q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ +# equations (11) & (12)). 
+function exact_variational_posterior(fu, fx, y) + σ² = fx.Σy[1] + Kuf = cov(fu, fx) + Kuu = Symmetric(cov(fu)) + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + S = Symmetric(Kuu * (Σ \ Kuu)) + return MvNormal(m, S) +end From 7ea3c2f7e2e9256ceb68047bebd3f087c0849244 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 13:19:19 +0100 Subject: [PATCH 33/66] Use AbstractGPs TestUtils --- test/svgp.jl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/svgp.jl b/test/svgp.jl index b5e84b04..a55b9bf4 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -1,4 +1,20 @@ @testset "svgp" begin - x = 4 - @test x == 4 + rng = MersenneTwister(123456) + N_cond = 5 + N_a = 6 + N_b = 7 + + # Specify prior. + f = GP(Matern32Kernel()) + # Sample from prior. + x = collect(range(-1.0, 1.0; length=N_cond)) + fx = f(x, 1e-15) + y = rand(rng, fx) + + q = exact_variational_posterior(fx, fx, y) + f_approx_post = SparseGPs.approx_posterior(SVGP(), fx, q) + + a = collect(range(-1.0, 1.0; length=N_a)) + b = randn(rng, N_b) + AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end From 9b6557fcc7b2481512a82b350fef18e97e522837 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 19 Jul 2021 17:00:57 +0100 Subject: [PATCH 34/66] Added support for prior mean function --- src/svgp.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/svgp.jl b/src/svgp.jl index e87da8bc..2a5edd68 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -22,12 +22,13 @@ function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L - data = (A=A, m=m, Kuu=Kuu, B=B, α=Kuu \ m, u=fz.x) + α=Kuu \ (m - mean(fz)) + data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end function Statistics.mean(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) - return cov(f.prior, x, f.data.u) * f.data.α + return mean(f.prior, x) + cov(f.prior, x, f.data.u) * f.data.α end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) From 0e59e49ae9d32be55c20c96336addb85304e350c Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 02:52:56 +0100 Subject: [PATCH 35/66] Added MC expectation and refactored elbo --- Project.toml | 1 + examples/classification.jl | 2 +- src/SparseGPs.jl | 1 + src/elbo.jl | 121 ++++++++++++++++++++++++++----------- 4 files changed, 89 insertions(+), 36 deletions(-) diff --git a/Project.toml b/Project.toml index e2645e29..5f55a43d 100644 --- a/Project.toml +++ b/Project.toml @@ -10,5 +10,6 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/examples/classification.jl b/examples/classification.jl index 03146024..153f633c 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -91,7 +91,7 @@ end function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=:montecarlo) end # %% diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 4bd6092b..ad55bb39 100644 --- a/src/SparseGPs.jl +++ 
b/src/SparseGPs.jl @@ -7,6 +7,7 @@ using Statistics using StatsBase using FastGaussQuadrature using GPLikelihoods +using SpecialFunctions using ChainRulesCore using AbstractGPs: diff --git a/src/elbo.jl b/src/elbo.jl index 5894faeb..eab1afa1 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,3 +1,6 @@ +"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." +ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} + """ elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) @@ -14,15 +17,11 @@ function elbo( y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y) + n_data=length(y), + method=:default, + kwargs... ) - n_batch = length(y) - kl_term, f_mean, f_var = _elbo_intermediates(fx, fz, q) - - Σy = diag(fx.Σy) # n.b. this assumes uncorrelated observation noise - variational_exp = expected_loglik(y, f_mean, f_var, Σy) - scale = n_data / n_batch - return sum(variational_exp) * scale - kl_term + return _elbo(fx, y, fz, q, fx.Σy, n_data, method; kwargs...) end @@ -36,30 +35,34 @@ function elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y) + n_data=length(y), + method=:default, + kwargs... ) - n_batch = length(y) - kl_term, f_mean, f_var = _elbo_intermediates(lfx.fx, fz, q) - - variational_exp = expected_loglik(y, f_mean, f_var, lfx.lik) - scale = n_data / n_batch - return variational_exp * scale - kl_term + return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) end -# Computes the common intermediates needed for the ELBO -function _elbo_intermediates( + +function _elbo( fx::FiniteGP, + y::AbstractVector, fz::FiniteGP, - q::AbstractMvNormal + q::AbstractMvNormal, + lik::Union{AbstractVecOrMat,ScalarLikelihood}, + n_data::Integer, + method::Symbol; + kwargs... ) - kl_term = StatsBase.kldivergence(q, fz) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) - return kl_term, f_mean, f_var -end + variational_exp = expected_loglik(y, f_mean, f_var, lik; method, kwargs...) -"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." -ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood} + kl_term = StatsBase.kldivergence(q, fz) + + n_batch = length(y) + scale = n_data / n_batch + return sum(variational_exp) * scale - kl_term +end """ expected_loglik(y, f_mean, f_var, [Σy | lik]) @@ -89,17 +92,23 @@ have independent marginals such that only the marginals of `q(f)` are required. function expected_loglik end """ - expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractVector) + expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) -The expected log likelihood for a Gaussian likelihood, computed in closed form. +The expected log likelihood for a Gaussian likelihood, computed in closed form by default. """ function expected_loglik( y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractVector + Σy::AbstractMatrix; + method=:default, + kwargs... ) - return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) + if method === :default + return closed_form_expectation(y, f_mean, f_var, diag(Σy)) + else + return expected_loglik(y, f_mean, f_var, GaussianLikelihood(Σy[1]); method, kwargs...) 
+ end end """ @@ -113,16 +122,48 @@ function expected_loglik( f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; - n_points=20 + method=:default, + n_points=20, + n_samples=20 ) - return sum(gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points=n_points)) + if method === :default && has_closed_form_expectation(lik) + return closed_form_expectation(y, f_mean, f_var, lik) + elseif method === :default || method === :gausshermite + return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points) + elseif method === :montecarlo + return monte_carlo_expectation(y, f_mean, f_var, lik; n_samples) + end end -function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - q_μ, q_Σ = mean(q), cov(q) - (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + - Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) +function closed_form_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractVector + ) + return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) +end + +function closed_form_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + ::PoissonLikelihood + ) + return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) +end + +function monte_carlo_expectation( + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::ScalarLikelihood; + n_samples=20 +) + # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var + fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), n_samples) + lls = loglikelihood.(lik.(fs), y) + return sum(lls) / n_samples end function gauss_hermite_quadrature( @@ -139,7 +180,17 @@ function gauss_hermite_quadrature( # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) - return (1/√π) * lls * ws + return sum((1/√π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) + +function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) + p_μ, p_Σ = mean(p), cov(p) + q_μ, q_Σ = mean(q), cov(q) + (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + + Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) +end + +has_closed_form_expectation(lik::Union{PoissonLikelihood,GaussianLikelihood}) = true +has_closed_form_expectation(lik) = false From 38ed15ff973beb73fd694ad5f4f1f25c8373129e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 03:08:16 +0100 Subject: [PATCH 36/66] Updated docstrings --- src/elbo.jl | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index eab1afa1..58dd9d9b 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -2,12 +2,18 @@ ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. +`method` selects which method is used to calculate the expected loglikelihood in +the ELBO. The options are: `:default`, `:gausshermite` and `:montecarlo`. For +likelihoods with a closed form solution, `:default` uses this exact solution. 
If +there is no such solution, `:default` is instead synonymous with +`:gausshermite`. + [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. @@ -26,7 +32,7 @@ end """ - elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y)) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -42,7 +48,7 @@ function elbo( return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) end - +# Compute the common elements of the ELBO function _elbo( fx::FiniteGP, y::AbstractVector, @@ -94,7 +100,10 @@ function expected_loglik end """ expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) -The expected log likelihood for a Gaussian likelihood, computed in closed form by default. +The expected log likelihood for a Gaussian likelihood, computed in closed form +by default. If using the closed form solution, the noise Σy is assumed to be +uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montecarlo`, +the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( y::AbstractVector{<:Real}, @@ -112,10 +121,11 @@ function expected_loglik( end """ - expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; n_points=20) + expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; method=:default, n_points=20, n_samples=20) -The expected log likelihood for a `ScalarLikelihood`, approximated via -Gauss-Hermite quadrature with `n_points` quadrature points. +The expected log likelihood for a `ScalarLikelihood`, computed via `method`. +Defaults to a closed form solution if it exists, otherwise defaults to +Gauss-Hermite quadrature. 
""" function expected_loglik( y::AbstractVector, @@ -135,6 +145,7 @@ function expected_loglik( end end +# The closed form solution for independent Gaussian noise function closed_form_expectation( y::AbstractVector, f_mean::AbstractVector, @@ -144,6 +155,7 @@ function closed_form_expectation( return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) end +# The closed form solution for a Poisson likelihood function closed_form_expectation( y::AbstractVector, f_mean::AbstractVector, From c8a974f5d77f379a3688de35931c16983329e92b Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 14:59:19 +0100 Subject: [PATCH 37/66] Dispatch on types instead of symbols --- examples/classification.jl | 2 +- src/SparseGPs.jl | 7 ++- src/elbo.jl | 118 +++++++++++++++++++++---------------- 3 files changed, 73 insertions(+), 54 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index 153f633c..b1442a09 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -91,7 +91,7 @@ end function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method=:montecarlo) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) end # %% diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index ad55bb39..b39b7a51 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -22,7 +22,12 @@ using AbstractGPs: export elbo, approx_posterior, - SVGP + SVGP, + Default, + Analytic, + GaussHermite, + MonteCarlo + include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index 58dd9d9b..61739a69 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,6 +1,21 @@ "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} + +abstract type ExpectationMethod end +struct Default <: ExpectationMethod end +struct Analytic <: ExpectationMethod end + +struct GaussHermite <: ExpectationMethod + n_points +end +GaussHermite() = GaussHermite(20) + +struct MonteCarlo <: ExpectationMethod + n_samples +end +MonteCarlo() = MonteCarlo(20) + """ elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) @@ -24,10 +39,9 @@ function elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=:default, - kwargs... + method=Default() ) - return _elbo(fx, y, fz, q, fx.Σy, n_data, method; kwargs...) + return _elbo(method, fx, y, fz, q, fx.Σy, n_data) end @@ -42,26 +56,24 @@ function elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=:default, - kwargs... + method=Default() ) - return _elbo(lfx.fx, y, fz, q, lfx.lik, n_data, method; kwargs...) + return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end # Compute the common elements of the ELBO function _elbo( + method::ExpectationMethod, fx::FiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal, lik::Union{AbstractVecOrMat,ScalarLikelihood}, - n_data::Integer, - method::Symbol; - kwargs... + n_data::Integer ) post = approx_posterior(SVGP(), fz, q) f_mean, f_var = mean_and_var(post, fx.x) - variational_exp = expected_loglik(y, f_mean, f_var, lik; method, kwargs...) + variational_exp = expected_loglik(method, y, f_mean, f_var, lik) kl_term = StatsBase.kldivergence(q, fz) @@ -106,18 +118,36 @@ uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montec the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). 
""" function expected_loglik( + ::Default, y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix; - method=:default, - kwargs... + Σy::AbstractMatrix +) + method = _default_method(GaussianLikelihood()) + expected_loglik(method, y, f_mean, f_var, Σy) +end + +# The closed form solution for independent Gaussian noise +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractMatrix ) - if method === :default - return closed_form_expectation(y, f_mean, f_var, diag(Σy)) - else - return expected_loglik(y, f_mean, f_var, GaussianLikelihood(Σy[1]); method, kwargs...) - end + Σy_diag = diag(Σy) + return sum(-0.5 * (log(2π) .+ log.(Σy_diag) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy_diag)) +end + +function expected_loglik( + method::Union{GaussHermite,MonteCarlo}, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + Σy::AbstractMatrix +) + return expected_loglik(method, y, f_mean, f_var, GaussianLikelihood(Σy[1])) end """ @@ -128,67 +158,51 @@ Defaults to a closed form solution if it exists, otherwise defaults to Gauss-Hermite quadrature. """ function expected_loglik( + ::Default, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::ScalarLikelihood; - method=:default, - n_points=20, - n_samples=20 + lik::ScalarLikelihood ) - if method === :default && has_closed_form_expectation(lik) - return closed_form_expectation(y, f_mean, f_var, lik) - elseif method === :default || method === :gausshermite - return gauss_hermite_quadrature(y, f_mean, f_var, lik; n_points) - elseif method === :montecarlo - return monte_carlo_expectation(y, f_mean, f_var, lik; n_samples) - end -end - -# The closed form solution for independent Gaussian noise -function closed_form_expectation( - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractVector - ) - return sum(-0.5 * (log(2π) .+ log.(Σy) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy)) + method = _default_method(lik) + expected_loglik(method, y, f_mean, f_var, lik) end # The closed form solution for a Poisson likelihood -function closed_form_expectation( +function expected_loglik( + ::Analytic, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, ::PoissonLikelihood - ) +) return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) end -function monte_carlo_expectation( +function expected_loglik( + mc::MonteCarlo, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik::ScalarLikelihood; - n_samples=20 + lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var - fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), n_samples) + fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), mc.n_samples) lls = loglikelihood.(lik.(fs), y) - return sum(lls) / n_samples + return sum(lls) / mc.n_samples end -function gauss_hermite_quadrature( +function expected_loglik( + gh::GaussHermite, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - lik; - n_points=20 + lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) - xs, ws = gausshermite(n_points) + xs, ws = gausshermite(gh.n_points) # size(fs): (length(y), n_points) fs = √2 * .√f_var .* transpose(xs) .+ f_mean lls = loglikelihood.(lik.(fs), y) @@ -204,5 +218,5 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end -has_closed_form_expectation(lik::Union{PoissonLikelihood,GaussianLikelihood}) = true -has_closed_form_expectation(lik) = false +_default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() +_default_method(_) = GaussHermite() From 56507a8e2157f236a687d82d40a33b873b073ee2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:08:49 +0100 Subject: [PATCH 38/66] Update doctrings --- src/elbo.jl | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 61739a69..14d4b5a8 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -17,17 +17,17 @@ end MonteCarlo() = MonteCarlo(20) """ - elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) + elbo(fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) Compute the Evidence Lower BOund from [1] for the process `fx.f` where `y` are observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. `method` selects which method is used to calculate the expected loglikelihood in -the ELBO. The options are: `:default`, `:gausshermite` and `:montecarlo`. For -likelihoods with a closed form solution, `:default` uses this exact solution. If -there is no such solution, `:default` is instead synonymous with -`:gausshermite`. +the ELBO. The options are: `Default()`, `Analytic()`, `GaussHermite()` and +`MonteCarlo()`. For likelihoods with an analytic solution, `Default()` uses this +exact solution. If there is no such solution, `Default()` either uses +`GaussHermite()` or `MonteCarlo()`, depending on the likelihood. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -46,7 +46,7 @@ end """ - elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=:default) + elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ @@ -83,7 +83,7 @@ function _elbo( end """ - expected_loglik(y, f_mean, f_var, [Σy | lik]) + expected_loglik(method, y, f_mean, f_var, [Σy | lik]) This function computes the expected log likelihood: @@ -100,7 +100,7 @@ where `q(u)` is the variational distribution over inducing points (see [`elbo`](@ref)). Where possible, this expectation is calculated in closed form. Otherwise, it is -approximated using Gauss-Hermite quadrature by default. +approximated using either Gauss-Hermite quadrature or Monte Carlo. # Extended help @@ -114,7 +114,7 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `:gausshermite` or `:montecarlo`, +uncorrelated (i.e. only diag(Σy) is used). 
If using `GaussHermite()` or `MonteCarlo()`, the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( @@ -151,7 +151,7 @@ function expected_loglik( end """ - expected_loglik(y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood; method=:default, n_points=20, n_samples=20) + expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) The expected log likelihood for a `ScalarLikelihood`, computed via `method`. Defaults to a closed form solution if it exists, otherwise defaults to @@ -179,6 +179,16 @@ function expected_loglik( return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) end +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik +) + return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) +end + function expected_loglik( mc::MonteCarlo, y::AbstractVector, From 857ecc34f4c718a063cb83c7caff61903acbffef Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:09:16 +0100 Subject: [PATCH 39/66] Enforce type for MonteCarlo and GaussHermite --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 14d4b5a8..95f15900 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -7,12 +7,12 @@ struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end struct GaussHermite <: ExpectationMethod - n_points + n_points::Int end GaussHermite() = GaussHermite(20) struct MonteCarlo <: ExpectationMethod - n_samples + n_samples::Int end MonteCarlo() = MonteCarlo(20) From 1bbf38568b8e3ed935786a093e64e066c7b88231 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:17:04 +0100 Subject: [PATCH 40/66] Added error for Analytic --- src/elbo.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 95f15900..cabe300e 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -186,7 +186,10 @@ function expected_loglik( f_var::AbstractVector, lik ) - return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) + return error( + "No analytic solution exists for ", lik, + ". Use `Default()`, `GaussHermite()` or `MonteCarlo()` instead." + ) end function expected_loglik( From bbd8502c66dfabf7000066df56de5d0b14e08fb3 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 15:26:31 +0100 Subject: [PATCH 41/66] Rename GaussHermite to Quadrature --- src/elbo.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index cabe300e..d2d3533b 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -6,10 +6,10 @@ abstract type ExpectationMethod end struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end -struct GaussHermite <: ExpectationMethod +struct Quadrature <: ExpectationMethod n_points::Int end -GaussHermite() = GaussHermite(20) +Quadrature() = Quadrature(20) struct MonteCarlo <: ExpectationMethod n_samples::Int @@ -24,10 +24,10 @@ observations of `fx`, pseudo-inputs are given by `z = fz.x` and `q(u)` is a variational distribution over inducing points `u = f(z)`. `method` selects which method is used to calculate the expected loglikelihood in -the ELBO. The options are: `Default()`, `Analytic()`, `GaussHermite()` and +the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and `MonteCarlo()`. 
For likelihoods with an analytic solution, `Default()` uses this exact solution. If there is no such solution, `Default()` either uses -`GaussHermite()` or `MonteCarlo()`, depending on the likelihood. +`Quadrature()` or `MonteCarlo()`, depending on the likelihood. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -114,7 +114,7 @@ function expected_loglik end The expected log likelihood for a Gaussian likelihood, computed in closed form by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `GaussHermite()` or `MonteCarlo()`, +uncorrelated (i.e. only diag(Σy) is used). If using `Quadrature()` or `MonteCarlo()`, the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). """ function expected_loglik( @@ -141,7 +141,7 @@ function expected_loglik( end function expected_loglik( - method::Union{GaussHermite,MonteCarlo}, + method::Union{Quadrature,MonteCarlo}, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, @@ -188,7 +188,7 @@ function expected_loglik( ) return error( "No analytic solution exists for ", lik, - ". Use `Default()`, `GaussHermite()` or `MonteCarlo()` instead." + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." ) end @@ -206,7 +206,7 @@ function expected_loglik( end function expected_loglik( - gh::GaussHermite, + gh::Quadrature, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, @@ -232,4 +232,4 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) end _default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() -_default_method(_) = GaussHermite() +_default_method(_) = Quadrature() From 0563d0192f830889f40e9feda51c2274008aee8d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 18:00:57 +0100 Subject: [PATCH 42/66] Assume homoscedastic Gaussian noise --- src/SparseGPs.jl | 2 +- src/elbo.jl | 64 ++++++++++++++---------------------------------- 2 files changed, 20 insertions(+), 46 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index b39b7a51..6209acdb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -25,7 +25,7 @@ export elbo, SVGP, Default, Analytic, - GaussHermite, + Quadrature, MonteCarlo diff --git a/src/elbo.jl b/src/elbo.jl index d2d3533b..a6ea2ba4 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -29,6 +29,9 @@ the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and exact solution. If there is no such solution, `Default()` either uses `Quadrature()` or `MonteCarlo()`, depending on the likelihood. +N.B. the observation noise `fx.Σy` is assumed to be homoscedastic and +uncorrelated - i.e. only `fx.Σy[1]` is used. + [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
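As a quick illustration of the `method` keyword documented above, a minimal usage sketch; the data, kernel and variational distribution below are placeholders, and the qualified call `SparseGPs.elbo` follows the spelling used in the examples elsewhere in this series:

    using AbstractGPs, SparseGPs, Distributions, LinearAlgebra

    f = GP(Matern32Kernel())
    x = rand(50); y = sin.(x); z = x[1:10]
    fx, fz = f(x, 0.1), f(z, 1e-6)                 # homoscedastic noise; jitter on the inducing points
    q = MvNormal(zeros(10), Matrix(0.1I, 10, 10))  # an arbitrary (non-optimised) q(u)

    SparseGPs.elbo(fx, y, fz, q)                          # Default(): analytic Gaussian expectation
    SparseGPs.elbo(fx, y, fz, q; method=Quadrature(50))   # Gauss-Hermite with 50 nodes
    SparseGPs.elbo(fx, y, fz, q; method=MonteCarlo(1000)) # 1000 reparameterised samples

All three calls estimate the same quantity; for the Gaussian likelihood the quadrature and Monte Carlo variants only serve as sanity checks against the analytic result.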
@@ -41,7 +44,7 @@ function elbo( n_data=length(y), method=Default() ) - return _elbo(method, fx, y, fz, q, fx.Σy, n_data) + return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end @@ -68,7 +71,7 @@ function _elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal, - lik::Union{AbstractVecOrMat,ScalarLikelihood}, + lik::ScalarLikelihood, n_data::Integer ) post = approx_posterior(SVGP(), fz, q) @@ -83,7 +86,7 @@ function _elbo( end """ - expected_loglik(method, y, f_mean, f_var, [Σy | lik]) + expected_loglik(method, y, f_mean, f_var, lik) This function computes the expected log likelihood: @@ -97,7 +100,8 @@ where `p(y | f)` is the process likelihood. q(f) = ∫ p(f | u) q(u) du ``` where `q(u)` is the variational distribution over inducing points (see -[`elbo`](@ref)). +[`elbo`](@ref)). The marginal means and variances of `q(f)` are given by +`f_mean` and `f_var` respectively. Where possible, this expectation is calculated in closed form. Otherwise, it is approximated using either Gauss-Hermite quadrature or Monte Carlo. @@ -110,22 +114,21 @@ have independent marginals such that only the marginals of `q(f)` are required. function expected_loglik end """ - expected_loglik(y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, Σy::AbstractMatrix) + expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) -The expected log likelihood for a Gaussian likelihood, computed in closed form -by default. If using the closed form solution, the noise Σy is assumed to be -uncorrelated (i.e. only diag(Σy) is used). If using `Quadrature()` or `MonteCarlo()`, -the noise is assumed to be homoscedastic as well (i.e. only Σy[1] is used). +The expected log likelihood for a `ScalarLikelihood`, computed via `method`. +Defaults to a closed form solution if it exists, otherwise defaults to +Gauss-Hermite quadrature. """ function expected_loglik( ::Default, - y::AbstractVector{<:Real}, + y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix + lik::ScalarLikelihood ) - method = _default_method(GaussianLikelihood()) - expected_loglik(method, y, f_mean, f_var, Σy) + method = _default_method(lik) + expected_loglik(method, y, f_mean, f_var, lik) end # The closed form solution for independent Gaussian noise @@ -134,38 +137,9 @@ function expected_loglik( y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - Σy::AbstractMatrix -) - Σy_diag = diag(Σy) - return sum(-0.5 * (log(2π) .+ log.(Σy_diag) .+ ((y .- f_mean).^2 .+ f_var) ./ Σy_diag)) -end - -function expected_loglik( - method::Union{Quadrature,MonteCarlo}, - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - Σy::AbstractMatrix + lik::GaussianLikelihood ) - return expected_loglik(method, y, f_mean, f_var, GaussianLikelihood(Σy[1])) -end - -""" - expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) - -The expected log likelihood for a `ScalarLikelihood`, computed via `method`. -Defaults to a closed form solution if it exists, otherwise defaults to -Gauss-Hermite quadrature. 
-""" -function expected_loglik( - ::Default, - y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, - lik::ScalarLikelihood -) - method = _default_method(lik) - expected_loglik(method, y, f_mean, f_var, lik) + return sum(-0.5 * (log(2π) .+ log(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) end # The closed form solution for a Poisson likelihood @@ -176,7 +150,7 @@ function expected_loglik( f_var::AbstractVector, ::PoissonLikelihood ) - return sum(y .* f_mean - exp(f_mean .+ (f_var / 2) - loggamma.(y))) + return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end function expected_loglik( From fb9a56399e1710578928ecc20d52751f8fe57992 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 21 Jul 2021 18:01:16 +0100 Subject: [PATCH 43/66] Add tests for `expected_loglik` --- test/elbo.jl | 18 ++++++++++++++++++ test/runtests.jl | 4 ++++ 2 files changed, 22 insertions(+) create mode 100644 test/elbo.jl diff --git a/test/elbo.jl b/test/elbo.jl new file mode 100644 index 00000000..649f985b --- /dev/null +++ b/test/elbo.jl @@ -0,0 +1,18 @@ +@testset "elbo" begin + # Test that the various methods of computing expectations return the same + # result. + rng = MersenneTwister(123456) + f_mean = rand(rng, 10) + f_var = rand(rng, 10) + + @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) + l = lik() + methods = [Quadrature(100), MonteCarlo(1000000)] + def = SparseGPs._default_method(l) + if def isa Analytic push!(methods, def) end + y = rand.(rng, l.(f_mean)) + + results = map(m -> SparseGPs.expected_loglik(m, y, f_mean, f_var, l), methods) + @test all(x->isapprox(x, results[end], rtol=1e-3), results) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 5419760a..3a60e76b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,10 @@ include("test_utils.jl") println(" ") @info "Ran svgp tests" + include("elbo.jl") + println(" ") + @info "Ran elbo tests" + include("equivalences.jl") println(" ") @info "Ran equivalences tests" From e62fbf70e2c588bc797a0645c96ad7be27a00003 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 15:44:34 +0100 Subject: [PATCH 44/66] Require ExpLink for Poisson closed form --- src/elbo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index a6ea2ba4..e96a38c7 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -139,16 +139,16 @@ function expected_loglik( f_var::AbstractVector, lik::GaussianLikelihood ) - return sum(-0.5 * (log(2π) .+ log(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) end -# The closed form solution for a Poisson likelihood +# The closed form solution for a Poisson likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, - ::PoissonLikelihood + ::PoissonLikelihood{ExpLink} ) return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end From 36c62b941812b48b34cc98ddb4fac43012474172 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 15:49:15 +0100 Subject: [PATCH 45/66] Better error message --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index e96a38c7..167946ac 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -161,7 +161,7 @@ function expected_loglik( lik ) return error( - "No analytic solution exists for ", lik, + "No analytic solution exists for 
", typeof(lik), ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." ) end From 0ee10044df91dee8496cd3efcabe83bed0855467 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:53:29 +0100 Subject: [PATCH 46/66] Added close form for Gamma and Exponential --- src/elbo.jl | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 167946ac..33c62914 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,5 +1,11 @@ "Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." -ScalarLikelihood = Union{BernoulliLikelihood,PoissonLikelihood,GaussianLikelihood} +ScalarLikelihood = Union{ + BernoulliLikelihood, + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} abstract type ExpectationMethod end @@ -153,6 +159,29 @@ function expected_loglik( return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) end +# The closed form solution for an Exponential likelihood with an exponential inverse link function +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + ::ExponentialLikelihood{ExpLink} +) + return sum(-f_mean - y .* exp.((f_var / 2) .- f_mean)) +end + +# The closed form solution for a Gamma likelihood with an exponential inverse link function +function expected_loglik( + ::Analytic, + y::AbstractVector, + f_mean::AbstractVector, + f_var::AbstractVector, + lik::GammaLikelihood{<:Any, ExpLink} +) + return sum((lik.α - 1) * log.(y) .- y .* exp.((f_var / 2) .- f_mean) + .- lik.α * f_mean .- loggamma(lik.α)) +end + function expected_loglik( ::Analytic, y::AbstractVector, @@ -205,5 +234,11 @@ function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) end -_default_method(::Union{PoissonLikelihood,GaussianLikelihood}) = Analytic() +AnalyticLikelihood = Union{ + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} +_default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() From f648a7c6f11191d22c63dbdff31101d8a19f3450 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:56:04 +0100 Subject: [PATCH 47/66] Fix docstring --- src/svgp.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/svgp.jl b/src/svgp.jl index 2a5edd68..c4257eef 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -17,7 +17,6 @@ which can be found in closed form. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ - function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) From a9b9a57dab947ae20711de6f9e8ef2af1e6fc6f1 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Sat, 24 Jul 2021 17:59:33 +0100 Subject: [PATCH 48/66] Update docstring --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 33c62914..a1e2db65 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -109,8 +109,8 @@ where `q(u)` is the variational distribution over inducing points (see [`elbo`](@ref)). The marginal means and variances of `q(f)` are given by `f_mean` and `f_var` respectively. -Where possible, this expectation is calculated in closed form. Otherwise, it is -approximated using either Gauss-Hermite quadrature or Monte Carlo. 
+`method` determines which method is used to calculate the expected log +likelihood - see [`elbo`](@ref) for more details. # Extended help From b8e7d6b71a3659df52ef28270b02b2e1336a9605 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 16:56:00 +0100 Subject: [PATCH 49/66] Fix docstring --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index a1e2db65..92f87917 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,4 +1,4 @@ -"Likelihoods which take a scalar (or vector of scalars) as input and return a single scalar." +"Likelihoods which take a scalar as input and return a scalar." ScalarLikelihood = Union{ BernoulliLikelihood, PoissonLikelihood, From 9353e44d5511af90362de4cf8c576eb0ea9ca6ad Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 17:23:36 +0100 Subject: [PATCH 50/66] Restrict types for continuous distributions --- src/elbo.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 92f87917..e21d2bf4 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -140,7 +140,7 @@ end # The closed form solution for independent Gaussian noise function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, lik::GaussianLikelihood @@ -162,7 +162,7 @@ end # The closed form solution for an Exponential likelihood with an exponential inverse link function function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, ::ExponentialLikelihood{ExpLink} @@ -173,7 +173,7 @@ end # The closed form solution for a Gamma likelihood with an exponential inverse link function function expected_loglik( ::Analytic, - y::AbstractVector, + y::AbstractVector{<:Real}, f_mean::AbstractVector, f_var::AbstractVector, lik::GammaLikelihood{<:Any, ExpLink} From ea3d3c68c476b0c4a9e641c02d4a351b53ae0d1d Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Mon, 26 Jul 2021 18:31:30 +0100 Subject: [PATCH 51/66] Use `AbstractGPs.approx_posterior` and `elbo` --- src/SparseGPs.jl | 4 +--- src/elbo.jl | 4 ++-- src/svgp.jl | 2 +- test/equivalences.jl | 10 +++++----- test/svgp.jl | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 6209acdb..7c96a8bd 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -20,9 +20,7 @@ using AbstractGPs: diag_At_A, Xt_invA_X -export elbo, - approx_posterior, - SVGP, +export SVGP, Default, Analytic, Quadrature, diff --git a/src/elbo.jl b/src/elbo.jl index e21d2bf4..cefd09ae 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -42,7 +42,7 @@ uncorrelated - i.e. only `fx.Σy[1]` is used. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ -function elbo( +function AbstractGPs.elbo( fx::FiniteGP, y::AbstractVector{<:Real}, fz::FiniteGP, @@ -59,7 +59,7 @@ end Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood. """ -function elbo( +function AbstractGPs.elbo( lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, diff --git a/src/svgp.jl b/src/svgp.jl index c4257eef..c53f5054 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -17,7 +17,7 @@ which can be found in closed form. variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" -function approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) +function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), cholesky(cov(q)) Kuu = cholesky(Symmetric(cov(fz))) B = Kuu.L \ A.L diff --git a/test/equivalences.jl b/test/equivalences.jl index 1765c790..d03ee474 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -23,9 +23,9 @@ fu = f(z) q_ex = exact_variational_posterior(fu, fx, y) - gpr_post = AbstractGPs.posterior(fx, y) # Exact GP regression - vfe_post = AbstractGPs.approx_posterior(VFE(), fx, y, fu) # Titsias posterior - svgp_post = SparseGPs.approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior + gpr_post = posterior(fx, y) # Exact GP regression + vfe_post = approx_posterior(VFE(), fx, y, fu) # Titsias posterior + svgp_post = approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 @@ -78,7 +78,7 @@ svgp = SVGPModel(copy(k_init), copy(z), m, A) function SVGP_loss(x, y) fx, fz, q = svgp(x) - return -SparseGPs.elbo(fx, y, fz, q) + return -elbo(fx, y, fz, q) end ## THIRD - train the models @@ -102,7 +102,7 @@ f = GP(make_kernel(m.k)) fz = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) - return SparseGPs.approx_posterior(SVGP(), fz, q) + return approx_posterior(SVGP(), fz, q) end gpr_post = posterior(gpr, x, y) diff --git a/test/svgp.jl b/test/svgp.jl index a55b9bf4..7dd90692 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -12,7 +12,7 @@ y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) - f_approx_post = SparseGPs.approx_posterior(SVGP(), fx, q) + f_approx_post = approx_posterior(SVGP(), fx, q) a = collect(range(-1.0, 1.0; length=N_a)) b = randn(rng, N_b) From c1a45464d75f73ea9e8e53c533e3d4168008bbf2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 21:09:51 +0100 Subject: [PATCH 52/66] Minor formatting --- src/elbo.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index cefd09ae..3490ec92 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -53,7 +53,6 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end - """ elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) @@ -117,7 +116,7 @@ likelihood - see [`elbo`](@ref) for more details. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. 
""" -function expected_loglik end +expected_loglik(method, y, f_mean, f_var, lik) """ expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) From 835da224cfb0b6a5bbc8c0d43e2f99162c3f4c16 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:09:09 +0100 Subject: [PATCH 53/66] Dispatch on filled diagonal matrix obs noise --- Project.toml | 1 + src/SparseGPs.jl | 2 ++ src/elbo.jl | 6 +++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5f55a43d..0bb253b1 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 7c96a8bd..14fef8b4 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -9,8 +9,10 @@ using FastGaussQuadrature using GPLikelihoods using SpecialFunctions using ChainRulesCore +using FillArrays using AbstractGPs: + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, diff --git a/src/elbo.jl b/src/elbo.jl index 3490ec92..1cb2bf40 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -43,7 +43,7 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ function AbstractGPs.elbo( - fx::FiniteGP, + fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; @@ -53,6 +53,10 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) where T<:FiniteGP + return error("The observation noise fx.Σy may not be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +end + """ elbo(lfx::LatentFiniteGP, y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), method=Default()) From fa1cdc36bbd9b340b069a37d568645ee3534c04e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:39:12 +0100 Subject: [PATCH 54/66] Add elbo tests --- test/elbo.jl | 21 +++++++++++++++++++-- test/equivalences.jl | 12 ++++++------ test/test_utils.jl | 3 +++ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/test/elbo.jl b/test/elbo.jl index 649f985b..c4c879c2 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -1,9 +1,26 @@ @testset "elbo" begin + rng, N = MersenneTwister(654321), 20 + x = rand(rng, N) * 10 + y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) + z = x[begin:5] + + kernel = make_kernel([0.2, 0.6]) + f = GP(sin, kernel) + fx = f(x, 0.1) + fx_bad = f(x, fill(0.1, N)) + fz = f(z) + q_ex = exact_variational_posterior(fz, fx, y) + + @test elbo(fx, y, fz, q_ex) isa Real + @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y) + + @test_throws ErrorException elbo(fx_bad, y, fz, q_ex) + # Test that the various methods of computing expectations return the same # result. 
rng = MersenneTwister(123456) - f_mean = rand(rng, 10) - f_var = rand(rng, 10) + f_mean = zeros(10) + f_var = ones(10) @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) l = lik() diff --git a/test/equivalences.jl b/test/equivalences.jl index d03ee474..1d3eba31 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -5,8 +5,6 @@ z = copy(x) # Set inducing inputs == training inputs - make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) - k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -20,18 +18,20 @@ kernel = make_kernel(k_init) f = GP(kernel) fx = f(x, lik_noise) - fu = f(z) - q_ex = exact_variational_posterior(fu, fx, y) + fz = f(z) + q_ex = exact_variational_posterior(fz, fx, y) gpr_post = posterior(fx, y) # Exact GP regression - vfe_post = approx_posterior(VFE(), fx, y, fu) # Titsias posterior - svgp_post = approx_posterior(SVGP(), fu, q_ex) # Hensman (2013) exact posterior + vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior + svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + + @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @testset "optimised posterior" begin diff --git a/test/test_utils.jl b/test/test_utils.jl index 0bae973c..33ed214f 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -1,3 +1,6 @@ +# Create a default kernel from two parameters k[1] and k[2] +make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) + # Computes the optimal closed form solution for the variational posterior # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). 
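The equivalence that the new tests above encode can be sketched end-to-end as follows. This is only an illustrative sketch of what `test/equivalences.jl` and `test/elbo.jl` verify, assuming the test environment of this PR (AbstractGPs, SparseGPs, Distributions, Flux for `softplus`, KernelFunctions) and that `test/test_utils.jl` has been included so `make_kernel` and `exact_variational_posterior` are in scope. With inducing inputs equal to the training inputs and the closed-form optimal `q(u)`, the SVGP posterior should coincide with exact GP regression, and the ELBO should be tight against `logpdf(fx, y)`.

    # Sketch only — assumes make_kernel and exact_variational_posterior from
    # test/test_utils.jl are in scope, as in the test suite of this PR.
    using Random, Statistics

    rng = MersenneTwister(654321)
    N = 20
    x = rand(rng, N) * 10
    y = sin.(x) + 0.4 * rand(rng, N)

    kernel = make_kernel([0.2, 0.6])            # helper defined in test/test_utils.jl
    f = GP(kernel)
    fx = f(x, 0.1)                              # homoscedastic Gaussian observation noise
    fz = f(copy(x))                             # inducing inputs == training inputs

    q = exact_variational_posterior(fz, fx, y)  # closed-form optimal q(u)
    svgp_post = approx_posterior(SVGP(), fz, q) # Hensman (2013) posterior
    gpr_post = posterior(fx, y)                 # exact GP regression posterior

    isapprox(mean(gpr_post, x), mean(svgp_post, x); atol=1e-10)  # expected: true
    elbo(fx, y, fz, q) ≈ logpdf(fx, y)                           # ELBO is tight at the optimum

The last line is the new assertion added to `test/equivalences.jl` in this patch: when `q(u)` is the exact variational posterior and z == x, the ELBO equals the exact log marginal likelihood up to numerical error.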
From af41ca3a0384e0300aa1720569902e8a9b27708b Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 22:57:45 +0100 Subject: [PATCH 55/66] Small test changes --- test/elbo.jl | 2 +- test/equivalences.jl | 14 ++++++++------ test/test_utils.jl | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/test/elbo.jl b/test/elbo.jl index c4c879c2..2d33b53a 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -7,13 +7,13 @@ kernel = make_kernel([0.2, 0.6]) f = GP(sin, kernel) fx = f(x, 0.1) - fx_bad = f(x, fill(0.1, N)) fz = f(z) q_ex = exact_variational_posterior(fz, fx, y) @test elbo(fx, y, fz, q_ex) isa Real @test elbo(fx, y, fz, q_ex) ≤ logpdf(fx, y) + fx_bad = f(x, fill(0.1, N)) @test_throws ErrorException elbo(fx_bad, y, fz, q_ex) # Test that the various methods of computing expectations return the same diff --git a/test/equivalences.jl b/test/equivalences.jl index 1d3eba31..162e8d3d 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -36,7 +36,9 @@ @testset "optimised posterior" begin jitter = 1e-5 - + + make_gp(kernel) = GP(kernel) + ## FIRST - define the models # GPR - Exact GP regression struct GPRModel @@ -45,7 +47,7 @@ @Flux.functor GPRModel function (m::GPRModel)(x) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fx = f(x, lik_noise) return fx end @@ -60,7 +62,7 @@ @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function (m::SVGPModel)(x) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) q = MvNormal(m.m, m.A'm.A) fx = f(x, lik_noise) fz = f(m.z, jitter) @@ -93,18 +95,18 @@ ## FOURTH - construct the posteriors function posterior(m::GPRModel, x, y) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fx = f(x, lik_noise) return AbstractGPs.posterior(fx, y) end function posterior(m::SVGPModel) - f = GP(make_kernel(m.k)) + f = make_gp(make_kernel(m.k)) fz = f(m.z, jitter) q = MvNormal(m.m, m.A'm.A) return approx_posterior(SVGP(), fz, q) end - + gpr_post = posterior(gpr, x, y) svgp_post = posterior(svgp) diff --git a/test/test_utils.jl b/test/test_utils.jl index 33ed214f..0ac02c41 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -3,7 +3,7 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # Computes the optimal closed form solution for the variational posterior # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ -# equations (11) & (12)). +# equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) From de2c4cd42fc0cae5079ff91501a329bc5f7de7e2 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Tue, 27 Jul 2021 23:07:23 +0100 Subject: [PATCH 56/66] Fix elbo error --- src/elbo.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 1cb2bf40..4a90329d 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -53,8 +53,8 @@ function AbstractGPs.elbo( return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) where T<:FiniteGP - return error("The observation noise fx.Σy may not be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) 
+ return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") end """ From f07c6f17a4722234546acdd1ddff0de0e11a86a8 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 13:55:34 +0100 Subject: [PATCH 57/66] Remove qualifier from kldivergence Co-authored-by: st-- --- src/elbo.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elbo.jl b/src/elbo.jl index 4a90329d..17ebf802 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -87,7 +87,7 @@ function _elbo( f_mean, f_var = mean_and_var(post, fx.x) variational_exp = expected_loglik(method, y, f_mean, f_var, lik) - kl_term = StatsBase.kldivergence(q, fz) + kl_term = kldivergence(q, fz) n_batch = length(y) scale = n_data / n_batch From 9f4d2951c1663f04657e603663c9b5220fd5b142 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:00:12 +0100 Subject: [PATCH 58/66] Check for ZeroMean --- test/elbo.jl | 2 +- test/test_utils.jl | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/elbo.jl b/test/elbo.jl index 2d33b53a..7de90d9a 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -5,7 +5,7 @@ z = x[begin:5] kernel = make_kernel([0.2, 0.6]) - f = GP(sin, kernel) + f = GP(kernel) fx = f(x, 0.1) fz = f(z) q_ex = exact_variational_posterior(fz, fx, y) diff --git a/test/test_utils.jl b/test/test_utils.jl index 0ac02c41..02b50670 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,6 +5,7 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) + fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) From ca5f1488b4f374ed5e005c8a66c54e618509df21 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:20:32 +0100 Subject: [PATCH 59/66] Fix classification example jitter --- examples/classification.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index b1442a09..ab476bce 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -51,6 +51,7 @@ plt = plot( ) scatter!(plt, x, y; seriescolor="blue", label="Data points") + # %% # Plot the same samples, but pushed through a logistic sigmoid to constrain # them in (0, 1). @@ -80,9 +81,11 @@ end @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() +jitter = 1e-4 + function (m::SVGPModel)(x) kernel = make_kernel(m.k) - f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) + f = LatentGP(GP(kernel), lik, jitter) q = MvNormal(m.m, m.A'm.A) fx = f(x) fu = f(m.z).fx @@ -117,7 +120,7 @@ println(flux_loss(x, y)) Flux.train!( (x, y) -> flux_loss(x, y), parameters, - ncycle([(x, y)], 1000), # Train for 1000 epochs + ncycle([(x, y)], 2000), # Train for 1000 epochs opt ) @@ -127,10 +130,9 @@ println(flux_loss(x, y)) # %% # After optimisation, plot samples from the underlying posterior GP. 
- fu = f(z).fx # want the underlying FiniteGP post = SparseGPs.approx_posterior(SVGP(), fu, MvNormal(m, A'A)) -l_post = LatentGP(post, BernoulliLikelihood(), 0.1) +l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) From 66ec256e8df64bb13447fc052b12ff7c7421dfde Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 14:30:00 +0100 Subject: [PATCH 60/66] Remove unnecessary imports from AbstractGPs --- src/SparseGPs.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 14fef8b4..cc0d156b 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -16,8 +16,6 @@ using AbstractGPs: FiniteGP, LatentFiniteGP, ApproxPosteriorGP, - _cholesky, - _symmetric, At_A, diag_At_A, Xt_invA_X From 6841074f3aaf0a09a917250732739df4b30035a6 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 15:10:31 +0100 Subject: [PATCH 61/66] Better cholesky of covariance methods --- src/SparseGPs.jl | 1 - src/svgp.jl | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index cc0d156b..2c2c380f 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -26,7 +26,6 @@ export SVGP, Quadrature, MonteCarlo - include("elbo.jl") include("svgp.jl") diff --git a/src/svgp.jl b/src/svgp.jl index c53f5054..12b9e293 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -18,8 +18,8 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. """ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) - m, A = mean(q), cholesky(cov(q)) - Kuu = cholesky(Symmetric(cov(fz))) + m, A = mean(q), _chol_cov(q) + Kuu = _chol_cov(fz) B = Kuu.L \ A.L α=Kuu \ (m - mean(fz)) data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) @@ -66,3 +66,6 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end + +_chol_cov(q::AbstractMvNormal) = cholesky(Symmetric(cov(q))) +_chol_cov(q::MvNormal) = cholesky(q.Σ) From 1594ee8ff9490c8c568412b3da7a540e5c8345ab Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 15:16:24 +0100 Subject: [PATCH 62/66] Use KLDivergences --- Project.toml | 1 + src/SparseGPs.jl | 1 + src/elbo.jl | 7 ------- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 0bb253b1..de63578f 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPLikelihoods = "6031954c-0455-49d7-b3b9-3e1c99afaf40" +KLDivergences = "3c9cd921-3d3f-41e2-830c-e020174918cc" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 2c2c380f..575ed4cb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -10,6 +10,7 @@ using GPLikelihoods using SpecialFunctions using ChainRulesCore using FillArrays +using KLDivergences using AbstractGPs: AbstractGP, diff --git a/src/elbo.jl b/src/elbo.jl index 17ebf802..7c1e74fb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -230,13 +230,6 @@ end ChainRulesCore.@non_differentiable gausshermite(n) -function StatsBase.kldivergence(q::AbstractMvNormal, p::AbstractMvNormal) - p_μ, p_Σ = mean(p), cov(p) - q_μ, q_Σ = mean(q), 
cov(q) - (1/2) .* (logdet(p_Σ) - logdet(q_Σ) - length(p_μ) + tr(p_Σ \ q_Σ) + - Xt_invA_X(cholesky(p_Σ), (q_μ - p_μ))) -end - AnalyticLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, From 878b2145d1e8346b4ada39f7a0ed0babb6f9791e Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Wed, 28 Jul 2021 16:01:54 +0100 Subject: [PATCH 63/66] Use vector of marginals `q_f` vs. `f_mean, f_var` --- src/elbo.jl | 69 ++++++++++++++++++++++++---------------------------- test/elbo.jl | 7 +++--- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 7c1e74fb..e25e43da 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -35,8 +35,8 @@ the ELBO. The options are: `Default()`, `Analytic()`, `Quadrature()` and exact solution. If there is no such solution, `Default()` either uses `Quadrature()` or `MonteCarlo()`, depending on the likelihood. -N.B. the observation noise `fx.Σy` is assumed to be homoscedastic and -uncorrelated - i.e. only `fx.Σy[1]` is used. +N.B. the likelihood is assumed to be Gaussian with observation noise `fx.Σy`. +Further, `fx.Σy` must be homoscedastic and uncorrelated - i.e. `fx.Σy = α * I`. [1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable variational Gaussian process classification." Artificial Intelligence and @@ -84,8 +84,8 @@ function _elbo( n_data::Integer ) post = approx_posterior(SVGP(), fz, q) - f_mean, f_var = mean_and_var(post, fx.x) - variational_exp = expected_loglik(method, y, f_mean, f_var, lik) + q_f = marginals(post(fx.x)) + variational_exp = expected_loglik(method, y, q_f, lik) kl_term = kldivergence(q, fz) @@ -95,7 +95,7 @@ function _elbo( end """ - expected_loglik(method, y, f_mean, f_var, lik) + expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) This function computes the expected log likelihood: @@ -109,8 +109,7 @@ where `p(y | f)` is the process likelihood. q(f) = ∫ p(f | u) q(u) du ``` where `q(u)` is the variational distribution over inducing points (see -[`elbo`](@ref)). The marginal means and variances of `q(f)` are given by -`f_mean` and `f_var` respectively. +[`elbo`](@ref)). The marginal distributions of `q(f)` are given by `q_f`. `method` determines which method is used to calculate the expected log likelihood - see [`elbo`](@ref) for more details. @@ -120,10 +119,10 @@ likelihood - see [`elbo`](@ref) for more details. `q(f)` is assumed to be an `MvNormal` distribution and `p(y | f)` is assumed to have independent marginals such that only the marginals of `q(f)` are required. """ -expected_loglik(method, y, f_mean, f_var, lik) +expected_loglik(method, y, q_f, lik) """ - expected_loglik(method::ExpectationMethod, y::AbstractVector, f_mean::AbstractVector, f_var::AbstractVector, lik::ScalarLikelihood) + expected_loglik(method::ExpectationMethod, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood) The expected log likelihood for a `ScalarLikelihood`, computed via `method`. Defaults to a closed form solution if it exists, otherwise defaults to @@ -132,64 +131,61 @@ Gauss-Hermite quadrature. 
function expected_loglik( ::Default, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) method = _default_method(lik) - expected_loglik(method, y, f_mean, f_var, lik) + expected_loglik(method, y, q_f, lik) end # The closed form solution for independent Gaussian noise function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::GaussianLikelihood ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- f_mean).^2 .+ f_var) / lik.σ²)) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) end # The closed form solution for a Poisson likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, ::PoissonLikelihood{ExpLink} -) - return sum((y .* f_mean) - exp.(f_mean .+ (f_var / 2)) - loggamma.(y .+ 1)) + ) + f_μ = mean.(q_f) + return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end # The closed form solution for an Exponential likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, ::ExponentialLikelihood{ExpLink} -) - return sum(-f_mean - y .* exp.((f_var / 2) .- f_mean)) + ) + f_μ = mean.(q_f) + return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end # The closed form solution for a Gamma likelihood with an exponential inverse link function function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::GammaLikelihood{<:Any, ExpLink} -) - return sum((lik.α - 1) * log.(y) .- y .* exp.((f_var / 2) .- f_mean) - .- lik.α * f_mean .- loggamma(lik.α)) + ) + f_μ = mean.(q_f) + return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) + .- lik.α * f_μ .- loggamma(lik.α)) end function expected_loglik( ::Analytic, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik ) return error( @@ -201,12 +197,12 @@ end function expected_loglik( mc::MonteCarlo, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) - # take 'n_samples' reparameterised samples with μ=f_mean and σ²=f_var - fs = f_mean .+ .√f_var .* randn(eltype(f_mean), length(f_mean), mc.n_samples) + # take 'n_samples' reparameterised samples + f_μ = mean.(q_f) + fs = f_μ .+ std.(q_f) .* randn(eltype(f_μ), length(q_f), mc.n_samples) lls = loglikelihood.(lik.(fs), y) return sum(lls) / mc.n_samples end @@ -214,8 +210,7 @@ end function expected_loglik( gh::Quadrature, y::AbstractVector, - f_mean::AbstractVector, - f_var::AbstractVector, + q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature @@ -223,7 +218,7 @@ function expected_loglik( # (see eg. 
en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) xs, ws = gausshermite(gh.n_points) # size(fs): (length(y), n_points) - fs = √2 * .√f_var .* transpose(xs) .+ f_mean + fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) return sum((1/√π) * lls * ws) end diff --git a/test/elbo.jl b/test/elbo.jl index 7de90d9a..07b89035 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -19,17 +19,16 @@ # Test that the various methods of computing expectations return the same # result. rng = MersenneTwister(123456) - f_mean = zeros(10) - f_var = ones(10) + q_f = Normal.(zeros(10), ones(10)) @testset "$lik" for lik in Base.uniontypes(SparseGPs.ScalarLikelihood) l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) if def isa Analytic push!(methods, def) end - y = rand.(rng, l.(f_mean)) + y = rand.(rng, l.(zeros(10))) - results = map(m -> SparseGPs.expected_loglik(m, y, f_mean, f_var, l), methods) + results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) @test all(x->isapprox(x, results[end], rtol=1e-3), results) end end From be967226317e5f9aa7e23474b9b86c9434d70d6a Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:25:09 +0100 Subject: [PATCH 64/66] Ran JuliaFormatter --- examples/classification.jl | 58 ++++++++++------------------ examples/regression.jl | 65 ++++++++++++++++--------------- src/SparseGPs.jl | 14 +------ src/elbo.jl | 78 ++++++++++++++++++++------------------ src/svgp.jl | 12 +++--- test/elbo.jl | 6 ++- test/equivalences.jl | 29 +++++++------- test/svgp.jl | 4 +- test/test_utils.jl | 7 ++-- 9 files changed, 125 insertions(+), 148 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index ab476bce..c97dc556 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,7 @@ using DelimitedFiles using IterTools using Plots -default(; legend=:outertopright, size=(700, 400)) +default(; legend = :outertopright, size = (700, 400)) using Random Random.seed!(1234) @@ -40,16 +40,10 @@ fx = f(x) # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6),20) +prior_f_samples = rand(f.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - prior_f_samples; - seriescolor="red", - linealpha=0.2, - label="" -) -scatter!(plt, x, y; seriescolor="blue", label="Data points") +plt = plot(x_plot, prior_f_samples; seriescolor = "red", linealpha = 0.2, label = "") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # %% @@ -57,14 +51,8 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") # them in (0, 1). 
prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot( - x_plot, - prior_y_samples; - seriescolor="red", - linealpha=0.2, - label="" -) -scatter!(plt, x, y; seriescolor="blue", label="Data points") +plt = plot(x_plot, prior_y_samples; seriescolor = "red", linealpha = 0.2, label = "") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # %% @@ -72,13 +60,13 @@ scatter!(plt, x, y; seriescolor="blue", label="Data points") using Flux struct SVGPModel - k # kernel parameters - m # variational mean - A # variational covariance - z # inducing points + k::Any # kernel parameters + m::Any # variational mean + A::Any # variational covariance + z::Any # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -92,9 +80,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data=length(y)) +function flux_loss(x, y; n_data = length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method = MonteCarlo()) end # %% @@ -121,7 +109,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt + opt, ) # %% @@ -136,13 +124,7 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - post_f_samples; - seriescolor="red", - linealpha=0.2, - legend=false -) +plt = plot(x_plot, post_f_samples; seriescolor = "red", linealpha = 0.2, legend = false) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. @@ -151,10 +133,10 @@ post_y_samples = mean.(l_post.lik.(post_f_samples)) plt = plot( x_plot, post_y_samples; - seriescolor="red", - linealpha=0.2, + seriescolor = "red", + linealpha = 0.2, # legend=false, - label="" + label = "", ) -scatter!(plt, x, y; seriescolor="blue", label="Data points") -vline!(z; label="Pseudo-points") +scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +vline!(z; label = "Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index d537f448..82a31e61 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -8,7 +8,7 @@ using Optim using IterTools using Plots -default(; legend=:outertopright, size=(700, 400)) +default(; legend = :outertopright, size = (700, 400)) using Random Random.seed!(1234) @@ -23,7 +23,7 @@ N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) -scatter(x, y; xlabel="x", ylabel="y", legend=false) +scatter(x, y; xlabel = "x", ylabel = "y", legend = false) # %% @@ -34,13 +34,13 @@ lik_noise = 0.3 jitter = 1e-5 struct SVGPModel - k # kernel parameters - m # variational mean - A # variational covariance - z # inducing points + k::Any # kernel parameters + m::Any # variational mean + A::Any # variational covariance + z::Any # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -68,7 +68,7 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. 
-function flux_loss(x, y; n_data=length(y)) +function flux_loss(x, y; n_data = length(y)) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -90,7 +90,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +data_loader = Flux.Data.DataLoader((x, y), batchsize = b) # %% # Negative ELBO before training @@ -99,10 +99,10 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data=N), + (x, y) -> flux_loss(x, y; n_data = N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt + opt, ) # %% @@ -116,16 +116,16 @@ post = posterior(model) scatter( x, y; - markershape=:xcross, - markeralpha=0.1, - xlim=(-1, 1), - xlabel="x", - ylabel="y", - title="posterior (VI with sparse grid)", - label="Train Data", + markershape = :xcross, + markeralpha = 0.1, + xlim = (-1, 1), + xlabel = "x", + ylabel = "y", + title = "posterior (VI with sparse grid)", + label = "Train Data", ) -plot!(-1:0.001:1, post; label="Posterior") -vline!(z; label="Pseudo-points") +plot!(-1:0.001:1, post; label = "Posterior") +vline!(z; label = "Pseudo-points") # %% There is a closed form optimal solution for the variational posterior q(u) @@ -137,8 +137,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -164,15 +164,14 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - markershape=:xcross, - markeralpha=0.1, - xlim=(-1, 1), - xlabel="x", - ylabel="y", - title="posterior (VI with sparse grid)", - label="Train Data", + markershape = :xcross, + markeralpha = 0.1, + xlim = (-1, 1), + xlabel = "x", + ylabel = "y", + title = "posterior (VI with sparse grid)", + label = "Train Data", ) -plot!(-1:0.001:1, ap_ex; label="SVGP posterior") -plot!(-1:0.001:1, ap_tits; label="Titsias posterior") -vline!(z; label="Pseudo-points") - +plot!(-1:0.001:1, ap_ex; label = "SVGP posterior") +plot!(-1:0.001:1, ap_tits; label = "Titsias posterior") +vline!(z; label = "Pseudo-points") diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 575ed4cb..c0a165af 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,19 +13,9 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, - FiniteGP, - LatentFiniteGP, - ApproxPosteriorGP, - At_A, - diag_At_A, - Xt_invA_X + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A -export SVGP, - Default, - Analytic, - Quadrature, - MonteCarlo +export SVGP, Default, Analytic, Quadrature, MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index e25e43da..94a16e49 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,7 +4,7 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood + GammaLikelihood, } @@ -43,18 +43,26 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, + fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y), - method=Default() + n_data = length(y), + method = Default(), ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) - return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo( + ::FiniteGP, + ::AbstractVector, + ::FiniteGP, + ::AbstractMvNormal; + kwargs..., +) + return error( + "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", + ) end """ @@ -67,8 +75,8 @@ function AbstractGPs.elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data=length(y), - method=Default() + n_data = length(y), + method = Default(), ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -81,7 +89,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer + n_data::Integer, ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -132,7 +140,7 @@ function expected_loglik( ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) method = _default_method(lik) expected_loglik(method, y, q_f, lik) @@ -143,9 +151,11 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood + lik::GaussianLikelihood, ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) + return sum( + -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²), + ) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -153,8 +163,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink} - ) + ::PoissonLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -164,8 +174,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink} - ) + ::ExponentialLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -175,22 +185,20 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any, ExpLink} - ) + lik::GammaLikelihood{<:Any,ExpLink}, +) f_μ = mean.(q_f) - return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) - .- lik.α * f_μ .- loggamma(lik.α)) + return sum( + (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- + loggamma(lik.α), + ) end -function expected_loglik( - ::Analytic, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik -) +function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) return error( - "No analytic solution exists for ", typeof(lik), - ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." + "No analytic solution exists for ", + typeof(lik), + ". 
Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", ) end @@ -198,7 +206,7 @@ function expected_loglik( mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -211,7 +219,7 @@ function expected_loglik( gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + lik::ScalarLikelihood, ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -220,16 +228,12 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1/√π) * lls * ws) + return sum((1 / √π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) -AnalyticLikelihood = Union{ - PoissonLikelihood, - GaussianLikelihood, - ExponentialLikelihood, - GammaLikelihood -} +AnalyticLikelihood = + Union{PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood} _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index 12b9e293..f9a10838 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,8 +21,8 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α=Kuu \ (m - mean(fz)) - data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) + α = Kuu \ (m - mean(fz)) + data = (A = A, m = m, Kuu = Kuu, B = B, α = α, u = fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index 07b89035..e4394da0 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,10 +25,12 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic push!(methods, def) end + if def isa Analytic + push!(methods, def) + end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x->isapprox(x, results[end], rtol=1e-3), results) + @test all(x -> isapprox(x, results[end], rtol = 1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 162e8d3d..8b375eb9 100644 --- a/test/equivalences.jl 
+++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -42,9 +42,9 @@ ## FIRST - define the models # GPR - Exact GP regression struct GPRModel - k # kernel parameters + k::Any # kernel parameters end - @Flux.functor GPRModel + Flux.@functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -54,12 +54,12 @@ # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel - k # kernel parameters - z # inducing points - m # variational mean - A # variational covariance sqrt (Σ = A'A) + k::Any # kernel parameters + z::Any # inducing points + m::Any # variational mean + A::Any # variational covariance sqrt (Σ = A'A) end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -111,9 +111,8 @@ svgp_post = posterior(svgp) ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol = 1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol = 1e-4)) end end - diff --git a/test/svgp.jl b/test/svgp.jl index 7dd90692..9cfef116 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -7,14 +7,14 @@ # Specify prior. f = GP(Matern32Kernel()) # Sample from prior. - x = collect(range(-1.0, 1.0; length=N_cond)) + x = collect(range(-1.0, 1.0; length = N_cond)) fx = f(x, 1e-15) y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) f_approx_post = approx_posterior(SVGP(), fx, q) - a = collect(range(-1.0, 1.0; length=N_a)) + a = collect(range(-1.0, 1.0; length = N_a)) b = randn(rng, N_b) AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end diff --git a/test/test_utils.jl b/test/test_utils.jl index 02b50670..805c799d 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,12 +5,13 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. 
function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || + error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end From 39f243a3622b13c61b5f6db89d8fe85735836dda Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:28:37 +0100 Subject: [PATCH 65/66] Revert "Ran JuliaFormatter" This reverts commit be967226317e5f9aa7e23474b9b86c9434d70d6a. --- examples/classification.jl | 58 ++++++++++++++++++---------- examples/regression.jl | 65 +++++++++++++++---------------- src/SparseGPs.jl | 14 ++++++- src/elbo.jl | 78 ++++++++++++++++++-------------------- src/svgp.jl | 12 +++--- test/elbo.jl | 6 +-- test/equivalences.jl | 29 +++++++------- test/svgp.jl | 4 +- test/test_utils.jl | 7 ++-- 9 files changed, 148 insertions(+), 125 deletions(-) diff --git a/examples/classification.jl b/examples/classification.jl index c97dc556..ab476bce 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -12,7 +12,7 @@ using DelimitedFiles using IterTools using Plots -default(; legend = :outertopright, size = (700, 400)) +default(; legend=:outertopright, size=(700, 400)) using Random Random.seed!(1234) @@ -40,10 +40,16 @@ fx = f(x) # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6), 20) +prior_f_samples = rand(f.f(x_plot, 1e-6),20) -plt = plot(x_plot, prior_f_samples; seriescolor = "red", linealpha = 0.2, label = "") -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +plt = plot( + x_plot, + prior_f_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% @@ -51,8 +57,14 @@ scatter!(plt, x, y; seriescolor = "blue", label = "Data points") # them in (0, 1). 
prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot(x_plot, prior_y_samples; seriescolor = "red", linealpha = 0.2, label = "") -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") +plt = plot( + x_plot, + prior_y_samples; + seriescolor="red", + linealpha=0.2, + label="" +) +scatter!(plt, x, y; seriescolor="blue", label="Data points") # %% @@ -60,13 +72,13 @@ scatter!(plt, x, y; seriescolor = "blue", label = "Data points") using Flux struct SVGPModel - k::Any # kernel parameters - m::Any # variational mean - A::Any # variational covariance - z::Any # inducing points + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end -Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -80,9 +92,9 @@ function (m::SVGPModel)(x) return fx, fu, q end -function flux_loss(x, y; n_data = length(y)) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) - return -SparseGPs.elbo(fx, y, fu, q; n_data, method = MonteCarlo()) + return -SparseGPs.elbo(fx, y, fu, q; n_data, method=MonteCarlo()) end # %% @@ -109,7 +121,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt, + opt ) # %% @@ -124,7 +136,13 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot(x_plot, post_f_samples; seriescolor = "red", linealpha = 0.2, legend = false) +plt = plot( + x_plot, + post_f_samples; + seriescolor="red", + linealpha=0.2, + legend=false +) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. @@ -133,10 +151,10 @@ post_y_samples = mean.(l_post.lik.(post_f_samples)) plt = plot( x_plot, post_y_samples; - seriescolor = "red", - linealpha = 0.2, + seriescolor="red", + linealpha=0.2, # legend=false, - label = "", + label="" ) -scatter!(plt, x, y; seriescolor = "blue", label = "Data points") -vline!(z; label = "Pseudo-points") +scatter!(plt, x, y; seriescolor="blue", label="Data points") +vline!(z; label="Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index 82a31e61..d537f448 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -8,7 +8,7 @@ using Optim using IterTools using Plots -default(; legend = :outertopright, size = (700, 400)) +default(; legend=:outertopright, size=(700, 400)) using Random Random.seed!(1234) @@ -23,7 +23,7 @@ N = 10000 # Number of training points x = rand(Uniform(-1, 1), N) y = g.(x) + 0.3 * randn(N) -scatter(x, y; xlabel = "x", ylabel = "y", legend = false) +scatter(x, y; xlabel="x", ylabel="y", legend=false) # %% @@ -34,13 +34,13 @@ lik_noise = 0.3 jitter = 1e-5 struct SVGPModel - k::Any # kernel parameters - m::Any # variational mean - A::Any # variational covariance - z::Any # inducing points + k # kernel parameters + m # variational mean + A # variational covariance + z # inducing points end -Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs +@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -68,7 +68,7 @@ function posterior(m::SVGPModel) end # Return the loss given data - in this case the negative ELBO. 
-function flux_loss(x, y; n_data = length(y)) +function flux_loss(x, y; n_data=length(y)) fx, fu, q = model(x) return -SparseGPs.elbo(fx, y, fu, q; n_data) end @@ -90,7 +90,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize = b) +data_loader = Flux.Data.DataLoader((x, y), batchsize=b) # %% # Negative ELBO before training @@ -99,10 +99,10 @@ println(flux_loss(x, y)) # %% # Train the model Flux.train!( - (x, y) -> flux_loss(x, y; n_data = N), + (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt, + opt ) # %% @@ -116,16 +116,16 @@ post = posterior(model) scatter( x, y; - markershape = :xcross, - markeralpha = 0.1, - xlim = (-1, 1), - xlabel = "x", - ylabel = "y", - title = "posterior (VI with sparse grid)", - label = "Train Data", + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", ) -plot!(-1:0.001:1, post; label = "Posterior") -vline!(z; label = "Pseudo-points") +plot!(-1:0.001:1, post; label="Posterior") +vline!(z; label="Pseudo-points") # %% There is a closed form optimal solution for the variational posterior q(u) @@ -137,8 +137,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) - m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -164,14 +164,15 @@ AbstractGPs.elbo(fx, y, fu) scatter( x, y; - markershape = :xcross, - markeralpha = 0.1, - xlim = (-1, 1), - xlabel = "x", - ylabel = "y", - title = "posterior (VI with sparse grid)", - label = "Train Data", + markershape=:xcross, + markeralpha=0.1, + xlim=(-1, 1), + xlabel="x", + ylabel="y", + title="posterior (VI with sparse grid)", + label="Train Data", ) -plot!(-1:0.001:1, ap_ex; label = "SVGP posterior") -plot!(-1:0.001:1, ap_tits; label = "Titsias posterior") -vline!(z; label = "Pseudo-points") +plot!(-1:0.001:1, ap_ex; label="SVGP posterior") +plot!(-1:0.001:1, ap_tits; label="Titsias posterior") +vline!(z; label="Pseudo-points") + diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index c0a165af..575ed4cb 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,9 +13,19 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A + AbstractGP, + FiniteGP, + LatentFiniteGP, + ApproxPosteriorGP, + At_A, + diag_At_A, + Xt_invA_X -export SVGP, Default, Analytic, Quadrature, MonteCarlo +export SVGP, + Default, + Analytic, + Quadrature, + MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index 94a16e49..e25e43da 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,7 +4,7 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood, + GammaLikelihood } @@ -43,26 +43,18 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, + fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; - n_data = length(y), - method = Default(), + n_data=length(y), + method=Default() ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo( - ::FiniteGP, - ::AbstractVector, - ::FiniteGP, - ::AbstractMvNormal; - kwargs..., -) - return error( - "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", - ) +function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) + return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") end """ @@ -75,8 +67,8 @@ function AbstractGPs.elbo( y::AbstractVector, fz::FiniteGP, q::AbstractMvNormal; - n_data = length(y), - method = Default(), + n_data=length(y), + method=Default() ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -89,7 +81,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer, + n_data::Integer ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -140,7 +132,7 @@ function expected_loglik( ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) method = _default_method(lik) expected_loglik(method, y, q_f, lik) @@ -151,11 +143,9 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood, + lik::GaussianLikelihood ) - return sum( - -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²), - ) + return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -163,8 +153,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink}, -) + ::PoissonLikelihood{ExpLink} + ) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -174,8 +164,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink}, -) + ::ExponentialLikelihood{ExpLink} + ) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -185,20 +175,22 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any,ExpLink}, -) - f_μ = mean.(q_f) - return sum( - (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- - loggamma(lik.α), + lik::GammaLikelihood{<:Any, ExpLink} ) + f_μ = mean.(q_f) + return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) + .- lik.α * f_μ .- loggamma(lik.α)) end -function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) +function expected_loglik( + ::Analytic, + y::AbstractVector, + q_f::AbstractVector{<:Normal}, + lik +) return error( - "No analytic solution exists for ", - typeof(lik), - ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", + "No analytic solution exists for ", typeof(lik), + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." 
) end @@ -206,7 +198,7 @@ function expected_loglik( mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -219,7 +211,7 @@ function expected_loglik( gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood, + lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -228,12 +220,16 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1 / √π) * lls * ws) + return sum((1/√π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) -AnalyticLikelihood = - Union{PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood} +AnalyticLikelihood = Union{ + PoissonLikelihood, + GaussianLikelihood, + ExponentialLikelihood, + GammaLikelihood +} _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index f9a10838..12b9e293 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,8 +21,8 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α = Kuu \ (m - mean(fz)) - data = (A = A, m = m, Kuu = Kuu, B = B, α = α, u = fz.x) + α=Kuu \ (m - mean(fz)) + data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index e4394da0..07b89035 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,12 +25,10 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic - push!(methods, def) - end + if def isa Analytic push!(methods, def) end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x -> isapprox(x, results[end], rtol = 1e-3), results) + @test all(x->isapprox(x, results[end], rtol=1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 8b375eb9..162e8d3d 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 
* cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -42,9 +42,9 @@ ## FIRST - define the models # GPR - Exact GP regression struct GPRModel - k::Any # kernel parameters + k # kernel parameters end - Flux.@functor GPRModel + @Flux.functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -54,12 +54,12 @@ # SVGP - Sparse variational GP regression (Hensman 2014) struct SVGPModel - k::Any # kernel parameters - z::Any # inducing points - m::Any # variational mean - A::Any # variational covariance sqrt (Σ = A'A) + k # kernel parameters + z # inducing points + m # variational mean + A # variational covariance sqrt (Σ = A'A) end - Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs + @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -111,8 +111,9 @@ svgp_post = posterior(svgp) ## FIFTH - test equivalences - @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol = 1e-4)) - @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol = 1e-4)) + @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) + @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end end + diff --git a/test/svgp.jl b/test/svgp.jl index 9cfef116..7dd90692 100644 --- a/test/svgp.jl +++ b/test/svgp.jl @@ -7,14 +7,14 @@ # Specify prior. f = GP(Matern32Kernel()) # Sample from prior. - x = collect(range(-1.0, 1.0; length = N_cond)) + x = collect(range(-1.0, 1.0; length=N_cond)) fx = f(x, 1e-15) y = rand(rng, fx) q = exact_variational_posterior(fx, fx, y) f_approx_post = approx_posterior(SVGP(), fx, q) - a = collect(range(-1.0, 1.0; length = N_a)) + a = collect(range(-1.0, 1.0; length=N_a)) b = randn(rng, N_b) AbstractGPs.TestUtils.test_internal_abstractgps_interface(rng, f_approx_post, a, b) end diff --git a/test/test_utils.jl b/test/test_utils.jl index 805c799d..02b50670 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,13 +5,12 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. 
function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || - error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) - m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y + Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) + m = ((1/σ²)*Kuu* (Σ\Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end From ef3292cb44f74581fce32e6d4563b4611870deb3 Mon Sep 17 00:00:00 2001 From: Ross Viljoen Date: Fri, 30 Jul 2021 12:37:07 +0100 Subject: [PATCH 66/66] Reformat with JuliaFormatter - BlueStyle --- .JuliaFormatter.toml | 1 + examples/classification.jl | 36 ++++------------- examples/regression.jl | 14 +++---- src/SparseGPs.jl | 14 +------ src/elbo.jl | 79 +++++++++++++++++--------------------- src/svgp.jl | 10 ++--- test/elbo.jl | 6 ++- test/equivalences.jl | 16 ++++---- test/test_utils.jl | 7 ++-- 9 files changed, 70 insertions(+), 113 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 00000000..323237ba --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1 @@ +style = "blue" diff --git a/examples/classification.jl b/examples/classification.jl index ab476bce..b1b7f6fc 100644 --- a/examples/classification.jl +++ b/examples/classification.jl @@ -23,7 +23,6 @@ data_file = pkgdir(SparseGPs) * "/examples/data/classif_1D.csv" x, y = eachcol(readdlm(data_file)) scatter(x, y) - # %% # First, create the GP kernel from given parameters k function make_kernel(k) @@ -36,37 +35,22 @@ kernel = make_kernel(k) f = LatentGP(GP(kernel), BernoulliLikelihood(), 0.1) fx = f(x) - # %% # Then, plot some samples from the prior underlying GP x_plot = 0:0.02:6 -prior_f_samples = rand(f.f(x_plot, 1e-6),20) +prior_f_samples = rand(f.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - prior_f_samples; - seriescolor="red", - linealpha=0.2, - label="" -) +plt = plot(x_plot, prior_f_samples; seriescolor="red", linealpha=0.2, label="") scatter!(plt, x, y; seriescolor="blue", label="Data points") - # %% # Plot the same samples, but pushed through a logistic sigmoid to constrain # them in (0, 1). prior_y_samples = mean.(f.lik.(prior_f_samples)) -plt = plot( - x_plot, - prior_y_samples; - seriescolor="red", - linealpha=0.2, - label="" -) +plt = plot(x_plot, prior_y_samples; seriescolor="red", linealpha=0.2, label="") scatter!(plt, x, y; seriescolor="blue", label="Data points") - # %% # A simple Flux model using Flux @@ -78,7 +62,7 @@ struct SVGPModel z # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs lik = BernoulliLikelihood() jitter = 1e-4 @@ -121,7 +105,7 @@ Flux.train!( (x, y) -> flux_loss(x, y), parameters, ncycle([(x, y)], 2000), # Train for 1000 epochs - opt + opt, ) # %% @@ -136,13 +120,7 @@ l_post = LatentGP(post, BernoulliLikelihood(), jitter) post_f_samples = rand(l_post.f(x_plot, 1e-6), 20) -plt = plot( - x_plot, - post_f_samples; - seriescolor="red", - linealpha=0.2, - legend=false -) +plt = plot(x_plot, post_f_samples; seriescolor="red", linealpha=0.2, legend=false) # %% # As above, push these samples through a logistic sigmoid to get posterior predictions. 
@@ -154,7 +132,7 @@ plt = plot( seriescolor="red", linealpha=0.2, # legend=false, - label="" + label="", ) scatter!(plt, x, y; seriescolor="blue", label="Data points") vline!(z; label="Pseudo-points") diff --git a/examples/regression.jl b/examples/regression.jl index d537f448..518ee761 100644 --- a/examples/regression.jl +++ b/examples/regression.jl @@ -25,7 +25,6 @@ y = g.(x) + 0.3 * randn(N) scatter(x, y; xlabel="x", ylabel="y", legend=false) - # %% # A simple Flux model using Flux @@ -40,7 +39,7 @@ struct SVGPModel z # inducing points end -@Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs +Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function make_kernel(k) return softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(softplus(k[2]))) @@ -73,7 +72,6 @@ function flux_loss(x, y; n_data=length(y)) return -SparseGPs.elbo(fx, y, fu, q; n_data) end - # %% M = 50 # number of inducing points @@ -90,7 +88,7 @@ model = SVGPModel(k, m, A, z) b = 100 # minibatch size opt = ADAM(0.001) parameters = Flux.params(model) -data_loader = Flux.Data.DataLoader((x, y), batchsize=b) +data_loader = Flux.Data.DataLoader((x, y); batchsize=b) # %% # Negative ELBO before training @@ -102,7 +100,7 @@ Flux.train!( (x, y) -> flux_loss(x, y; n_data=N), parameters, ncycle(data_loader, 300), # Train for 300 epochs - opt + opt, ) # %% @@ -127,7 +125,6 @@ scatter( plot!(-1:0.001:1, post; label="Posterior") vline!(z; label="Pseudo-points") - # %% There is a closed form optimal solution for the variational posterior q(u) # (e.g. https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). The SVGP posterior with this optimal q(u) should @@ -137,8 +134,8 @@ function exact_q(fu, fx, y) σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end @@ -175,4 +172,3 @@ scatter( plot!(-1:0.001:1, ap_ex; label="SVGP posterior") plot!(-1:0.001:1, ap_tits; label="Titsias posterior") vline!(z; label="Pseudo-points") - diff --git a/src/SparseGPs.jl b/src/SparseGPs.jl index 575ed4cb..c0a165af 100644 --- a/src/SparseGPs.jl +++ b/src/SparseGPs.jl @@ -13,19 +13,9 @@ using FillArrays using KLDivergences using AbstractGPs: - AbstractGP, - FiniteGP, - LatentFiniteGP, - ApproxPosteriorGP, - At_A, - diag_At_A, - Xt_invA_X + AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A -export SVGP, - Default, - Analytic, - Quadrature, - MonteCarlo +export SVGP, Default, Analytic, Quadrature, MonteCarlo include("elbo.jl") include("svgp.jl") diff --git a/src/elbo.jl b/src/elbo.jl index e25e43da..38627ceb 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -4,10 +4,9 @@ ScalarLikelihood = Union{ PoissonLikelihood, GaussianLikelihood, ExponentialLikelihood, - GammaLikelihood + GammaLikelihood, } - abstract type ExpectationMethod end struct Default <: ExpectationMethod end struct Analytic <: ExpectationMethod end @@ -43,18 +42,22 @@ variational Gaussian process classification." Artificial Intelligence and Statistics. PMLR, 2015. 
""" function AbstractGPs.elbo( - fx::FiniteGP{<:AbstractGP, <:AbstractVector, <:Diagonal{<:Real, <:Fill}}, + fx::FiniteGP{<:AbstractGP,<:AbstractVector,<:Diagonal{<:Real,<:Fill}}, y::AbstractVector{<:Real}, fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=Default() + method=Default(), ) return _elbo(method, fx, y, fz, q, GaussianLikelihood(fx.Σy[1]), n_data) end -function AbstractGPs.elbo(::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs...) - return error("The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)") +function AbstractGPs.elbo( + ::FiniteGP, ::AbstractVector, ::FiniteGP, ::AbstractMvNormal; kwargs... +) + return error( + "The observation noise fx.Σy must be homoscedastic.\n To avoid this error, construct fx using: f = GP(kernel); fx = f(x, σ²)", + ) end """ @@ -68,7 +71,7 @@ function AbstractGPs.elbo( fz::FiniteGP, q::AbstractMvNormal; n_data=length(y), - method=Default() + method=Default(), ) return _elbo(method, lfx.fx, y, fz, q, lfx.lik, n_data) end @@ -81,7 +84,7 @@ function _elbo( fz::FiniteGP, q::AbstractMvNormal, lik::ScalarLikelihood, - n_data::Integer + n_data::Integer, ) post = approx_posterior(SVGP(), fz, q) q_f = marginals(post(fx.x)) @@ -129,13 +132,10 @@ Defaults to a closed form solution if it exists, otherwise defaults to Gauss-Hermite quadrature. """ function expected_loglik( - ::Default, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + ::Default, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) method = _default_method(lik) - expected_loglik(method, y, q_f, lik) + return expected_loglik(method, y, q_f, lik) end # The closed form solution for independent Gaussian noise @@ -143,9 +143,11 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GaussianLikelihood + lik::GaussianLikelihood, ) - return sum(-0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)).^2 .+ var.(q_f)) / lik.σ²)) + return sum( + -0.5 * (log(2π) .+ log.(lik.σ²) .+ ((y .- mean.(q_f)) .^ 2 .+ var.(q_f)) / lik.σ²) + ) end # The closed form solution for a Poisson likelihood with an exponential inverse link function @@ -153,8 +155,8 @@ function expected_loglik( ::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, - ::PoissonLikelihood{ExpLink} - ) + ::PoissonLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum((y .* f_μ) - exp.(f_μ .+ (var.(q_f) / 2)) - loggamma.(y .+ 1)) end @@ -164,8 +166,8 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - ::ExponentialLikelihood{ExpLink} - ) + ::ExponentialLikelihood{ExpLink}, +) f_μ = mean.(q_f) return sum(-f_μ - y .* exp.((var.(q_f) / 2) .- f_μ)) end @@ -175,30 +177,25 @@ function expected_loglik( ::Analytic, y::AbstractVector{<:Real}, q_f::AbstractVector{<:Normal}, - lik::GammaLikelihood{<:Any, ExpLink} - ) + lik::GammaLikelihood{<:Any,ExpLink}, +) f_μ = mean.(q_f) - return sum((lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) - .- lik.α * f_μ .- loggamma(lik.α)) + return sum( + (lik.α - 1) * log.(y) .- y .* exp.((var.(q_f) / 2) .- f_μ) .- lik.α * f_μ .- + loggamma(lik.α), + ) end -function expected_loglik( - ::Analytic, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik -) +function expected_loglik(::Analytic, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik) return error( - "No analytic solution exists for ", typeof(lik), - ". 
Use `Default()`, `Quadrature()` or `MonteCarlo()` instead." + "No analytic solution exists for ", + typeof(lik), + ". Use `Default()`, `Quadrature()` or `MonteCarlo()` instead.", ) end function expected_loglik( - mc::MonteCarlo, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + mc::MonteCarlo, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # take 'n_samples' reparameterised samples f_μ = mean.(q_f) @@ -208,10 +205,7 @@ function expected_loglik( end function expected_loglik( - gh::Quadrature, - y::AbstractVector, - q_f::AbstractVector{<:Normal}, - lik::ScalarLikelihood + gh::Quadrature, y::AbstractVector, q_f::AbstractVector{<:Normal}, lik::ScalarLikelihood ) # Compute the expectation via Gauss-Hermite quadrature # using a reparameterisation by change of variable @@ -220,16 +214,13 @@ function expected_loglik( # size(fs): (length(y), n_points) fs = √2 * std.(q_f) .* transpose(xs) .+ mean.(q_f) lls = loglikelihood.(lik.(fs), y) - return sum((1/√π) * lls * ws) + return sum((1 / √π) * lls * ws) end ChainRulesCore.@non_differentiable gausshermite(n) AnalyticLikelihood = Union{ - PoissonLikelihood, - GaussianLikelihood, - ExponentialLikelihood, - GammaLikelihood + PoissonLikelihood,GaussianLikelihood,ExponentialLikelihood,GammaLikelihood } _default_method(::AnalyticLikelihood) = Analytic() _default_method(_) = Quadrature() diff --git a/src/svgp.jl b/src/svgp.jl index 12b9e293..0e71a265 100644 --- a/src/svgp.jl +++ b/src/svgp.jl @@ -21,7 +21,7 @@ function AbstractGPs.approx_posterior(::SVGP, fz::FiniteGP, q::AbstractMvNormal) m, A = mean(q), _chol_cov(q) Kuu = _chol_cov(fz) B = Kuu.L \ A.L - α=Kuu \ (m - mean(fz)) + α = Kuu \ (m - mean(fz)) data = (A=A, m=m, Kuu=Kuu, B=B, α=α, u=fz.x) return ApproxPosteriorGP(SVGP(), fz.f, data) end @@ -33,13 +33,13 @@ end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) end function Statistics.var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux - return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) end function Statistics.cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector, y::AbstractVector) @@ -55,7 +55,7 @@ function StatsBase.mean_and_cov(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) + Σ = cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D) return μ, Σ end @@ -63,7 +63,7 @@ function StatsBase.mean_and_var(f::ApproxPosteriorGP{SVGP}, x::AbstractVector) Cux = cov(f.prior, f.data.u, x) D = f.data.Kuu.L \ Cux μ = Cux' * f.data.α - Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) + Σ_diag = var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D) return μ, Σ_diag end diff --git a/test/elbo.jl b/test/elbo.jl index 07b89035..0f20367c 100644 --- a/test/elbo.jl +++ b/test/elbo.jl @@ -25,10 +25,12 @@ l = lik() methods = [Quadrature(100), MonteCarlo(1000000)] def = SparseGPs._default_method(l) - if def isa Analytic push!(methods, def) end + if def isa Analytic + push!(methods, def) + end y = rand.(rng, l.(zeros(10))) results = map(m -> SparseGPs.expected_loglik(m, y, q_f, l), methods) - @test all(x->isapprox(x, results[end], rtol=1e-3), results) + @test 
all(x -> isapprox(x, results[end]; rtol=1e-3), results) end end diff --git a/test/equivalences.jl b/test/equivalences.jl index 162e8d3d..3bb4c7bc 100644 --- a/test/equivalences.jl +++ b/test/equivalences.jl @@ -4,7 +4,7 @@ y = sin.(x) + 0.9 * cos.(x * 1.6) + 0.4 * rand(rng, N) z = copy(x) # Set inducing inputs == training inputs - + k_init = [0.2, 0.6] # initial kernel parameters lik_noise = 0.1 # The (fixed) Gaussian likelihood noise @@ -25,11 +25,11 @@ vfe_post = approx_posterior(VFE(), fx, y, fz) # Titsias posterior svgp_post = approx_posterior(SVGP(), fz, q_ex) # Hensman (2013) exact posterior - @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(gpr_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(gpr_post, x) ≈ cov(svgp_post, x) atol = 1e-10 - @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol=1e-10 - @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol=1e-10 + @test mean(vfe_post, x) ≈ mean(svgp_post, x) atol = 1e-10 + @test cov(vfe_post, x) ≈ cov(svgp_post, x) atol = 1e-10 @test elbo(fx, y, fz, q_ex) ≈ logpdf(fx, y) end @@ -44,7 +44,7 @@ struct GPRModel k # kernel parameters end - @Flux.functor GPRModel + Flux.@functor GPRModel function (m::GPRModel)(x) f = make_gp(make_kernel(m.k)) @@ -59,7 +59,7 @@ m # variational mean A # variational covariance sqrt (Σ = A'A) end - @Flux.functor SVGPModel (k, m, A,) # Don't train the inducing inputs + Flux.@functor SVGPModel (k, m, A) # Don't train the inducing inputs function (m::SVGPModel)(x) f = make_gp(make_kernel(m.k)) @@ -114,6 +114,4 @@ @test all(isapprox.(mean(gpr_post, x), mean(svgp_post, x), atol=1e-4)) @test all(isapprox.(cov(gpr_post, x), cov(svgp_post, x), atol=1e-4)) end - end - diff --git a/test/test_utils.jl b/test/test_utils.jl index 02b50670..805c799d 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -5,12 +5,13 @@ make_kernel(k) = softplus(k[1]) * (SqExponentialKernel() ∘ ScaleTransform(soft # q(u) (e.g. # https://krasserm.github.io/2020/12/12/gaussian-processes-sparse/ # equations (11) & (12)). Assumes a ZeroMean function. function exact_variational_posterior(fu, fx, y) - fu.f.mean isa AbstractGPs.ZeroMean || error("The exact posterior requires a GP with ZeroMean.") + fu.f.mean isa AbstractGPs.ZeroMean || + error("The exact posterior requires a GP with ZeroMean.") σ² = fx.Σy[1] Kuf = cov(fu, fx) Kuu = Symmetric(cov(fu)) - Σ = (Symmetric(cov(fu) + (1/σ²) * Kuf * Kuf')) - m = ((1/σ²)*Kuu* (Σ\Kuf)) * y + Σ = (Symmetric(cov(fu) + (1 / σ²) * Kuf * Kuf')) + m = ((1 / σ²) * Kuu * (Σ \ Kuf)) * y S = Symmetric(Kuu * (Σ \ Kuu)) return MvNormal(m, S) end
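As a closing sanity check on the quadrature fallback in expected_loglik: the change of
variables used there (f = √2 σ x + μ, with the weighted sum divided by √π) should agree
with the analytic Gaussian expectation from elbo.jl. A minimal sketch, assuming
FastGaussQuadrature and Distributions are available; the numeric constants are arbitrary
illustration values:

    using FastGaussQuadrature, Distributions

    μ, σ = 0.3, 0.8            # one marginal q(f) = N(μ, σ²)
    y, σn = 0.1, 0.5           # observation and Gaussian noise std
    xs, ws = gausshermite(20)  # nodes and weights for ∫ exp(-x²) g(x) dx

    # reparameterised quadrature, as in expected_loglik(::Quadrature, ...)
    quad = sum(ws .* logpdf.(Normal.(√2 .* σ .* xs .+ μ, σn), y)) / √π

    # closed form E_q[log N(y; f, σn²)], as in expected_loglik(::Analytic, ...)
    exact = -0.5 * (log(2π * σn^2) + ((y - μ)^2 + σ^2) / σn^2)

    quad ≈ exact               # the integrand is quadratic in f, so this matches
                               # to well within the rtol = 1e-3 used in the test above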