Support all GalacticOptim-compatible optimizers (#25)

* Add GalacticOptim as dependency * Use GalacticOptim API * Add comment about recomputing gradient * Add ForwardDiff as dependency * Refactor single-path pathfinder * Rearrange imports * Test optimization functions * Update tests * Increment version number * Document compatible optimizers * Copy positions just in case * Test NLopt works * Rename files * Clarify where remaining kwargs are forwarded * Increment minimum supported version to LTS * Test minimum supported version * Standardize use of params * Require grad function is specified * Add AbstractDifferentiation as dependency * Use AD backend to create optimization function * Improve optimization tests * Test AD backend functionality * Include AD * Use more compact keyword syntax * Provide nothing for parameters * Rearrange code * Support passing OptimizationFunction to multi-path pathfinder * Docstring formatting * Improve documentation * Test ad_backend * Mark version number as DEV
mlcolab · Feb 5, 2022 · a7ec9d3 · a7ec9d3
1 parent 7e6da49
commit a7ec9d3
Show file tree

Hide file tree

Showing 15 changed files with 344 additions and 93 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -10,6 +10,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
+          - '1.6'
           - '1'
         os:
           - ubuntu-latest

diff --git a/Project.toml b/Project.toml
@@ -1,10 +1,13 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen <[email protected]> and contributors"]
-version = "0.2.3"
+version = "0.3.0-DEV"
 
 [deps]
+AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GalacticOptim = "a75be94c-b780-496d-a8a9-0878b188d577"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
@@ -15,17 +18,21 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 
 [compat]
+AbstractDifferentiation = "0.4"
 Distributions = "0.25"
+ForwardDiff = "0.10"
+GalacticOptim = "2"
 Optim = "1.4"
 PDMats = "0.11"
 PSIS = "0.2"
 StatsBase = "0.33"
 StatsFuns = "0.9"
-julia = "1"
+julia = "1.6"
 
 [extras]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["ForwardDiff", "Test"]
+test = ["NLopt", "ReverseDiff", "Test"]
diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl
@@ -1,7 +1,11 @@
 module Pathfinder
 
+using AbstractDifferentiation: AD
 using Distributions: Distributions
+# ensure that ForwardDiff is conditionally loaded by GalacticOptim
+using ForwardDiff: ForwardDiff
 using LinearAlgebra
+using GalacticOptim: GalacticOptim
 using Optim: Optim, LineSearches
 using PDMats: PDMats
 using PSIS: PSIS
@@ -21,7 +25,7 @@ const DEFAULT_OPTIMIZER = Optim.LBFGS(;
 )
 
 include("woodbury.jl")
-include("maximize.jl")
+include("optimize.jl")
 include("inverse_hessian.jl")
 include("mvnormal.jl")
 include("elbo.jl")

diff --git a/src/maximize.jl b/src/maximize.jl
diff --git a/src/multipath.jl b/src/multipath.jl
@@ -1,7 +1,7 @@
 """
     multipathfinder(
         logp,
-        ∇logp,
+        [∇logp,]
         θ₀s::AbstractVector{AbstractVector{<:Real}},
         ndraws::Int;
         kwargs...
@@ -27,41 +27,75 @@ resulting draws better approximate draws from the target distribution ``p`` inst
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
-- `θ₀s`: vector of length `nruns` of initial points of length `dim` from which each
-    single-path Pathfinder run will begin
-- `ndraws`: number of approximate draws to return
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
+- `θ₀s::AbstractVector{AbstractVector{<:Real}}`: vector of length `nruns` of initial points
+    of length `dim` from which each single-path Pathfinder run will begin
+- `ndraws::Int`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `ndraws_per_run::Int=5`: The number of draws to take for each component before resampling.
 - `importance::Bool=true`: Perform Pareto smoothed importance resampling of draws.
+- `kwargs...` : Remaining keywords are forwarded to [`pathfinder`](@ref).
 
 # Returns
 - `q::Distributions.MixtureModel`: Uniformly weighted mixture of ELBO-maximizing
     multivariate normal distributions
 - `ϕ::AbstractMatrix{<:Real}`: approximate draws from target distribution with size
     `(dim, ndraws)`
 - `component_inds::Vector{Int}`: Indices ``k`` of components in ``q`` from which each column
-in `ϕ` was drawn.
+    in `ϕ` was drawn.
 """
+function multipathfinder(logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
 function multipathfinder(
-    logp,
-    ∇logp,
+    logp, ∇logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...
+)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
+
+"""
+    multipathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀s::AbstractVector{<:Real},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Filter samples from a mixture of multivariate normal distributions fit using `pathfinder`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`multipathfinder`](@ref) for a description of remaining arguments.
+"""
+function multipathfinder(
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀s,
     ndraws;
     ndraws_per_run::Int=5,
     rng::Random.AbstractRNG=Random.default_rng(),
     importance::Bool=true,
     kwargs...,
 )
+    if optim_fun.grad === nothing || optim_fun.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
     if ndraws > ndraws_per_run * length(θ₀s)
         @warn "More draws requested than total number of draws across replicas. Draws will not be unique."
     end
+    logp(x) = -optim_fun.f(x, nothing)
 
     # run pathfinder independently from each starting point
     # TODO: allow to be parallelized
     res = map(θ₀s) do θ₀
-        return pathfinder(logp, ∇logp, θ₀, ndraws_per_run; rng=rng, kwargs...)
+        return pathfinder(optim_fun, θ₀, ndraws_per_run; rng, kwargs...)
     end
     qs = reduce(vcat, first.(res))
     ϕs = reduce(hcat, getindex.(res, 2))

diff --git a/src/optimize.jl b/src/optimize.jl
@@ -0,0 +1,61 @@
+function build_optim_function(f; ad_backend=AD.ForwardDiffBackend())
+    ∇f(x) = only(AD.gradient(ad_backend, f, x))
+    return build_optim_function(f, ∇f; ad_backend)
+end
+function build_optim_function(f, ∇f; ad_backend=AD.ForwardDiffBackend())
+    # because we need explicit access to grad, we generate these ourselves instead of using
+    # GalacticOptim's auto-AD feature.
+    # TODO: switch to caching API if available, see
+    # https://github.com/JuliaDiff/AbstractDifferentiation.jl/issues/41
+    function grad(res, x, p...)
+        ∇fx = ∇f(x)
+        @. res = -∇fx
+        return res
+    end
+    function hess(res, x, p...)
+        H = only(AD.hessian(ad_backend, f, x))
+        @. res = -H
+        return res
+    end
+    function hv(res, x, v, p...)
+        Hv = only(AD.lazy_hessian(ad_backend, f, x) * v)
+        @. res = -Hv
+        return res
+    end
+    return GalacticOptim.OptimizationFunction((x, p...) -> -f(x); grad, hess, hv)
+end
+
+function build_optim_problem(optim_fun, x₀; kwargs...)
+    return GalacticOptim.OptimizationProblem(optim_fun, x₀, nothing; kwargs...)
+end
+
+function optimize_with_trace(prob, optimizer)
+    u0 = prob.u0
+    fun = prob.f
+    grad! = fun.grad
+    function ∇f(x)
+        ∇fx = similar(x)
+        grad!(∇fx, x, nothing)
+        rmul!(∇fx, -1)
+        return ∇fx
+    end
+    # caches for the trace of x, f(x), and ∇f(x)
+    xs = typeof(u0)[]
+    fxs = typeof(fun.f(u0, nothing))[]
+    ∇fxs = typeof(similar(u0))[]
+    function callback(x, nfx, args...)
+        # NOTE: GalacticOptim doesn't have an interface for accessing the gradient trace,
+        # so we need to recompute it ourselves
+        # see https://github.com/SciML/GalacticOptim.jl/issues/149
+        ∇fx = ∇f(x)
+        # terminate if optimization encounters NaNs
+        (isnan(nfx) || any(isnan, x) || any(isnan, ∇fx)) && return true
+        # some backends mutate x, so we must copy it
+        push!(xs, copy(x))
+        push!(fxs, -nfx)
+        push!(∇fxs, ∇fx)
+        return false
+    end
+    GalacticOptim.solve(prob, optimizer; cb=callback)
+    return xs, fxs, ∇fxs
+end
diff --git a/src/singlepath.jl b/src/singlepath.jl
@@ -1,49 +1,112 @@
 """
-    pathfinder(logp, ∇logp, θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
+    pathfinder(logp[, ∇logp], θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
 
-Find the best multivariate normal approximation encountered while optimizing `logp`.
+Find the best multivariate normal approximation encountered while maximizing `logp`.
 
-The multivariate normal approximation returned is the one that maximizes the evidence lower
-bound (ELBO), or equivalently, minimizes the KL divergence between
+From an optimization trajectory, Pathfinder constructs a sequence of (multivariate normal)
+approximations to the distribution specified by `logp`. The approximation that maximizes the
+evidence lower bound (ELBO), or equivalently, minimizes the KL divergence between the
+approximation and the true distribution, is returned.
+
+The covariance of the multivariate normal distribution is an inverse Hessian approximation
+constructed using at most the previous `history_length` steps.
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
 - `θ₀`: initial point of length `dim` from which to begin optimization
 - `ndraws`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `rng::Random.AbstractRNG`: The random number generator to be used for drawing samples
-- `optimizer::Optim.AbstractOptimizer`: Optimizer to be used for constructing trajectory.
-Defaults to `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=$DEFAULT_LINE_SEARCH)`.
+- `optimizer`: Optimizer to be used for constructing trajectory. Can be any optimizer
+    compatible with GalacticOptim, so long as it supports callbacks. Defaults to
+    `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=LineSearches.MoreThuente())`. See
+    the [GalacticOptim.jl documentation](https://galacticoptim.sciml.ai/stable) for details.
 - `history_length::Int=$DEFAULT_HISTORY_LENGTH`: Size of the history used to approximate the
-inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
+    inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
 - `ndraws_elbo::Int=5`: Number of draws used to estimate the ELBO
-- `kwargs...` : Remaining keywords are forwarded to `Optim.Options`.
+- `kwargs...` : Remaining keywords are forwarded to `GalacticOptim.OptimizationProblem`.
 
 # Returns
 - `q::Distributions.MvNormal`: ELBO-maximizing multivariate normal distribution
 - `ϕ::AbstractMatrix{<:Real}`: draws from multivariate normal with size `(dim, ndraws)`
 - `logqϕ::Vector{<:Real}`: log-density of multivariate normal at columns of `ϕ`
 """
+function pathfinder(logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+function pathfinder(logp, ∇logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+
+"""
+    pathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀::AbstractVector{<:Real},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Find the best multivariate normal approximation encountered while minimizing `f`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
 function pathfinder(
-    logp,
-    ∇logp,
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀,
     ndraws;
     rng::Random.AbstractRNG=Random.default_rng(),
-    optimizer::Optim.AbstractOptimizer=DEFAULT_OPTIMIZER,
+    optimizer=DEFAULT_OPTIMIZER,
     history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
     ndraws_elbo::Int=5,
     kwargs...,
 )
+    optim_prob = build_optim_problem(optim_fun, θ₀; kwargs...)
+    return pathfinder(optim_prob, ndraws; rng, optimizer, history_length, ndraws_elbo)
+end
+
+"""
+    pathfinder(prob::GalacticOptim.OptimizationProblem, ndraws::Int; kwargs...)
+
+Find the best multivariate normal approximation encountered while solving `prob`.
+
+`prob` is a user-created optimization problem that represents the negative log density with
+its gradient, an initial position and must have the necessary features (e.g. a Hessian
+function or specified automatic differentiation type) for the chosen optimization algorithm.
+For details, see
+[GalacticOptim.jl: Defining OptimizationProblems](https://galacticoptim.sciml.ai/stable/API/optimization_problem/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
+function pathfinder(
+    optim_prob::GalacticOptim.OptimizationProblem,
+    ndraws;
+    rng::Random.AbstractRNG=Random.default_rng(),
+    optimizer=DEFAULT_OPTIMIZER,
+    history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
+    ndraws_elbo::Int=5,
+)
+    if optim_prob.f.grad === nothing || optim_prob.f.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
+    logp(x) = -optim_prob.f.f(x, nothing)
     # compute trajectory
-    θs, logpθs, ∇logpθs = maximize_with_trace(logp, ∇logp, θ₀, optimizer; kwargs...)
+    θs, logpθs, ∇logpθs = optimize_with_trace(optim_prob, optimizer)
     L = length(θs) - 1
     @assert L + 1 == length(logpθs) == length(∇logpθs)
 
     # fit mv-normal distributions to trajectory
-    qs = fit_mvnormals(θs, ∇logpθs; history_length=history_length)
+    qs = fit_mvnormals(θs, ∇logpθs; history_length)
 
     # find ELBO-maximizing distribution
     lopt, elbo, ϕ, logqϕ = maximize_elbo(rng, logp, qs[2:end], ndraws_elbo)

diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
@@ -48,18 +48,20 @@ end
         n = 10
         history_length = 5
         logp(x) = logp_banana(x)
-        ∇logp(x) = ForwardDiff.gradient(logp, x)
         nocedal_wright_scaling(α, s, y) = fill!(similar(α), dot(y, s) / sum(abs2, y))
         θ₀ = 10 * randn(n)
 
+        ad_backend = AD.ForwardDiffBackend()
+        fun = Pathfinder.build_optim_function(logp; ad_backend)
+        prob = Pathfinder.build_optim_problem(fun, θ₀)
         optimizer = Optim.LBFGS(;
             m=history_length, linesearch=Optim.LineSearches.MoreThuente()
         )
-        θs, logpθs, ∇logpθs = Pathfinder.maximize_with_trace(logp, ∇logp, θ₀, optimizer)
+        θs, logpθs, ∇logpθs = Pathfinder.optimize_with_trace(prob, optimizer)
 
         # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
         Hs = Pathfinder.lbfgs_inverse_hessians(
-            θs, ∇logpθs; history_length=history_length, Hinit=nocedal_wright_scaling
+            θs, ∇logpθs; history_length, Hinit=nocedal_wright_scaling
         )
         ss = diff(θs)
         ps = (Hs .* ∇logpθs)[1:(end - 1)]
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,6 +10,7 @@ jobs: @@
           fail-fast: false
           matrix:
             version:
+              - '1.6'
               - '1'
             os:
               - ubuntu-latest
@@ Expand Down @@