Include number of rejected updates in the results (#116)

* Count number of bfgs updates rejected * Propagate count to outer function * Return number of BFGS updates rejected * Raise warning if updates were rejected * Add missing import * Update tests * Increment minor version number * Bump docs and test compatibilities
mlcolab · Jan 5, 2023 · 6b18635 · 6b18635 · sethaxen · Jan 5, 2023
1 parent 4b79342
commit 6b18635
Show file tree

Hide file tree

Showing 10 changed files with 39 additions and 15 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen <[email protected]> and contributors"]
-version = "0.5.6"
+version = "0.6.0"
 
 [deps]
 AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -21,7 +21,7 @@ DynamicHMC = "3"
 ForwardDiff = "0.10"
 LogDensityProblems = "1, 2"
 LogDensityProblemsAD = "1, 2"
-Pathfinder = "0.5"
+Pathfinder = "0.6"
 StatsFuns = "0.9, 1"
 StatsPlots = "0.14, 0.15"
 TransformVariables = "0.6, 0.7"

diff --git a/src/inverse_hessian.jl b/src/inverse_hessian.jl
@@ -10,12 +10,17 @@ function gilbert_init(α, s, y)
 end
 
 """
-    lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12) -> Vector{WoodburyPDMat}
+    lbfgs_inverse_hessians(
+        θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12
+    ) -> Tuple{Vector{WoodburyPDMat},Int}
 
 From an L-BFGS trajectory and gradients, compute the inverse Hessian approximations at each point.
 
 Given positions `θs` with gradients `∇logpθs`, construct LBFGS inverse Hessian
 approximations with the provided `history_length`.
+
+The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
+were rejected due to keeping the inverse Hessian positive definite.
 """
 function lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12)
     L = length(θs) - 1
@@ -34,6 +39,7 @@ function lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_len
     H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective) # H₀ = I
     Hs = [H] # trace of H
 
+    num_bfgs_updates_rejected = 0
     for l in 1:L
         θlp1, ∇logpθlp1 = θs[l + 1], ∇logpθs[l + 1]
         s .= θlp1 .- θ
@@ -48,15 +54,15 @@ function lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_len
             # initial diagonal estimate of H
             α = Hinit(α, s, y)
         else
-            @debug "Skipping inverse Hessian update from iteration $l to avoid negative curvature."
+            num_bfgs_updates_rejected += 1
         end
 
         θ, ∇logpθ = θlp1, ∇logpθlp1
         H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective)
         push!(Hs, H)
     end
 
-    return Hs
+    return Hs, num_bfgs_updates_rejected
 end
 
 """

diff --git a/src/mvnormal.jl b/src/mvnormal.jl
@@ -1,21 +1,25 @@
 """
-    fit_mvnormals(θs, ∇logpθs; history_length=5)
+    fit_mvnormals(θs, ∇logpθs; history_length=5) -> (dists, num_bfgs_updates_rejected)
 
 Fit a multivariate-normal distribution to each point on the trajectory `θs`.
 
 Given `θs` with gradients `∇logpθs`, construct LBFGS inverse Hessian approximations with
 the provided `history_length`. The inverse Hessians approximate a covariance. The
 covariances and corresponding means that define multivariate normal approximations per
 point are returned.
+
+The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
+were rejected due to keeping the inverse Hessian positive definite.
 """
 function fit_mvnormals(θs, ∇logpθs; kwargs...)
-    Σs = lbfgs_inverse_hessians(θs, ∇logpθs; kwargs...)
+    Σs, num_bfgs_updates_rejected = lbfgs_inverse_hessians(θs, ∇logpθs; kwargs...)
     trans = Transducers.MapSplat() do Σ, ∇logpθ, θ
         μ = muladd(Σ, ∇logpθ, θ)
         return Distributions.MvNormal(μ, Σ)
     end
     l = length(Σs)
-    return @views(zip(Σs, ∇logpθs[1:l], θs[1:l])) |> trans |> collect
+    dists = @views(zip(Σs, ∇logpθs[1:l], θs[1:l])) |> trans |> collect
+    return dists, num_bfgs_updates_rejected
 end
 
 # faster than computing `logpdf` and `rand` independently

diff --git a/src/singlepath.jl b/src/singlepath.jl
@@ -29,6 +29,8 @@ Container for results of single-path Pathfinder.
     `fit_distributions[fit_iteration + 1] == fit_distribution`
 - `elbo_estimates::AbstractVector{<:Pathfinder.ELBOEstimate}`: ELBO estimates for all but
     the first distribution in `fit_distributions`.
+- `num_bfgs_updates_rejected::Int`: Number of times a BFGS update to the reconstructed
+    inverse Hessian was rejected to keep the inverse Hessian positive definite.
 
 # Returns
 - [`PathfinderResult`](@ref)
@@ -49,6 +51,7 @@ struct PathfinderResult{I,O,R,OP,LP,FD,D,FDT,DT,OS,OT,EE}
     optim_trace::OT
     fit_distributions::Vector{FD}
     elbo_estimates::EE
+    num_bfgs_updates_rejected::Int
 end
 
 function Base.show(io::IO, ::MIME"text/plain", result::PathfinderResult)
@@ -197,6 +200,7 @@ function pathfinder(
         fit_distributions,
         fit_iteration,
         elbo_estimates,
+        num_bfgs_updates_rejected,
     ) = path_result
 
     if !success
@@ -207,6 +211,11 @@ function pathfinder(
         ndraws_elbo_actual = ndraws_elbo
     end
 
+    if num_bfgs_updates_rejected > 0
+        perc = round(num_bfgs_updates_rejected * (100//length(fit_distributions)); digits=1)
+        @warn "$num_bfgs_updates_rejected ($(perc)%) updates to the inverse Hessian estimate were rejected to keep it positive definite."
+    end
+
     fit_distribution = fit_distributions[fit_iteration + 1]
 
     # reuse existing draws; draw additional ones if necessary
@@ -238,6 +247,7 @@ function pathfinder(
         optim_trace,
         fit_distributions,
         elbo_estimates,
+        num_bfgs_updates_rejected,
     )
 end
 
@@ -283,7 +293,7 @@ function _pathfinder(
     success = L > 0
 
     # fit mv-normal distributions to trajectory
-    fit_distributions = fit_mvnormals(
+    fit_distributions, num_bfgs_updates_rejected = fit_mvnormals(
         optim_trace.points, optim_trace.gradients; history_length
     )
 
@@ -305,6 +315,7 @@ function _pathfinder(
         fit_distributions,
         fit_iteration,
         elbo_estimates,
+        num_bfgs_updates_rejected,
     )
 end
 

diff --git a/test/integration/AdvancedHMC/Project.toml b/test/integration/AdvancedHMC/Project.toml
@@ -17,6 +17,6 @@ Distributions = "0.25"
 ForwardDiff = "0.10"
 MCMCDiagnosticTools = "0.1.3, 0.2"
 Optim = "1.4"
-Pathfinder = "0.5"
+Pathfinder = "0.5, 0.6"
 StatsFuns = "0.9, 1"
 julia = "1.6"
diff --git a/test/integration/DynamicHMC/Project.toml b/test/integration/DynamicHMC/Project.toml
@@ -19,7 +19,7 @@ LogDensityProblems = "1, 2"
 LogDensityProblemsAD = "1"
 MCMCDiagnosticTools = "0.1.3, 0.2"
 Optim = "1.4"
-Pathfinder = "0.5"
+Pathfinder = "0.5, 0.6"
 StatsFuns = "0.9, 1"
 TransformVariables = "0.6, 0.7"
 TransformedLogDensities = "1"

diff --git a/test/integration/Turing/Project.toml b/test/integration/Turing/Project.toml
@@ -5,6 +5,6 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 
 [compat]
-Pathfinder = "0.5"
+Pathfinder = "0.5, 0.6"
 Turing = "0.21, 0.22, 0.23"
 julia = "1.6"
diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
@@ -1,3 +1,4 @@
+using AbstractDifferentiation
 using ForwardDiff
 using LinearAlgebra
 using Optim
@@ -60,7 +61,7 @@ end
         sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
 
         # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
-        Hs = Pathfinder.lbfgs_inverse_hessians(
+        Hs, num_bfgs_updates_rejected = Pathfinder.lbfgs_inverse_hessians(
             optim_trace.points,
             optim_trace.gradients;
             history_length,
@@ -71,5 +72,6 @@ end
         # check that next direction computed from Hessian is the same as the actual
         # direction that was taken
         @test all(≈(1), dot.(ps, ss) ./ norm.(ss) ./ norm.(ps))
+        @test num_bfgs_updates_rejected == 0
     end
 end
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
@@ -19,13 +19,14 @@ include("test_utils.jl")
         optimizer = Optim.LBFGS()
         history_length = optimizer.m
         _, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
-        Σs = Pathfinder.lbfgs_inverse_hessians(
+        Σs, num_bfgs_updates_rejected1 = Pathfinder.lbfgs_inverse_hessians(
             optim_trace.points, optim_trace.gradients; history_length
         )
-        dists = @inferred Pathfinder.fit_mvnormals(
+        dists, num_bfgs_updates_rejected2 = @inferred Pathfinder.fit_mvnormals(
             optim_trace.points, optim_trace.gradients; history_length
         )
         @test dists isa Vector{<:MvNormal{Float64,<:Pathfinder.WoodburyPDMat}}
+        @test num_bfgs_updates_rejected2 == num_bfgs_updates_rejected1
         @test Σs ≈ getproperty.(dists, :Σ)
         @test optim_trace.points .+ Σs .* optim_trace.gradients ≈ getproperty.(dists, :μ)
     end