
Commit

Accept LogDensityProblems objects (#122)
* Add LogDensityProblems tests

* Fix LogDensityProblem tests

* Update build_optim_function to take a LogDensityProblem

* Move code to test_utils.jl

* Update mvnormal tests

* Remove outdated comment

* Update inverse_hessian tests

* Update singlepath and multipath code

* Remove AbstractDifferentiation as a dependency

* Increment version number

* Don't assume `optim_fun` is thread-safe

* Update .gitignore

* Bump Pathfinder compat

* Update DynamicHMC tests

* Update documented examples

* Require LogDensityProblem with gradient

* Run multipathfinder in parallel by default.

* Fix bugs

* Unify DynamicHMC and AdvancedHMC integration tests

* Remove LogDensityProblems tests

* Update tests

* Update documentation
sethaxen authored Feb 1, 2023
1 parent 9e6bf45 commit b5d1917
Showing 21 changed files with 239 additions and 259 deletions.
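
In user-facing terms, the change swaps function-plus-gradient arguments for a single LogDensityProblems object. Below is a minimal editorial sketch of the new calling pattern, following the updated docs in this diff; `StdNormalProblem` is a made-up toy target, not code from the repository.

```julia
using ForwardDiff, LogDensityProblems, LogDensityProblemsAD, Pathfinder

# Toy standard-normal target written against the LogDensityProblems interface.
struct StdNormalProblem
    dim::Int
end
function LogDensityProblems.capabilities(::Type{<:StdNormalProblem})
    return LogDensityProblems.LogDensityOrder{0}()
end
LogDensityProblems.dimension(ℓ::StdNormalProblem) = ℓ.dim
LogDensityProblems.logdensity(::StdNormalProblem, x) = -sum(abs2, x) / 2

# Add a gradient via AD, then pass the problem object directly.
∇P = ADgradient(:ForwardDiff, StdNormalProblem(5))
result = pathfinder(∇P)                        # previously: pathfinder(logp, ∇logp; dim=5)
result_multi = multipathfinder(∇P, 1_000; nruns=8)
```
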
3 changes: 1 addition & 2 deletions .gitignore
@@ -1,6 +1,5 @@
*.jl.*.cov
*.jl.cov
*.jl.mem
/Manifest.toml
Manifest.toml
/docs/build/
/docs/Manifest.toml
9 changes: 4 additions & 5 deletions Project.toml
@@ -1,16 +1,16 @@
name = "Pathfinder"
uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
authors = ["Seth Axen <[email protected]> and contributors"]
version = "0.7.0-DEV"
version = "0.7.0"

[deps]
AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Folds = "41a02a25-b8f0-4f67-bc48-60067656b558"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
@@ -26,12 +26,12 @@ Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"

[compat]
AbstractDifferentiation = "0.4"
Accessors = "0.1"
Distributions = "0.25"
Folds = "0.2"
ForwardDiff = "0.10"
IrrationalConstants = "0.1.1"
LogDensityProblems = "2"
Optim = "1.4"
Optimization = "3"
OptimizationOptimJL = "0.1"
@@ -47,8 +47,7 @@ julia = "1.6"

[extras]
OptimizationNLopt = "4e6fcdb7-1186-4e1f-a706-475e75c168bb"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["OptimizationNLopt", "ReverseDiff", "Test"]
test = ["OptimizationNLopt", "Test"]
12 changes: 5 additions & 7 deletions docs/src/examples/initializing-hmc.md
@@ -85,20 +85,18 @@ nothing # hide
To use DynamicHMC, we first need to transform our model to an unconstrained space using [TransformVariables.jl](https://tamaspapp.eu/TransformVariables.jl/stable/) and wrap it in a type that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface:

```@example 1
using DynamicHMC, LogDensityProblems, LogDensityProblemsAD, TransformVariables
using DynamicHMC, ForwardDiff, LogDensityProblems, LogDensityProblemsAD, TransformVariables
using TransformedLogDensities: TransformedLogDensity
transform = as((σ=asℝ₊, α=asℝ, β=as(Array, J)))
P = TransformedLogDensity(transform, model)
∇P = ADgradient(:ForwardDiff, P)
```

Pathfinder, on the other hand, expects a log-density function:
Pathfinder can take any object that implements this interface.

```@example 1
logp(x) = LogDensityProblems.logdensity(P, x)
∇logp(x) = LogDensityProblems.logdensity_and_gradient(∇P, x)[2]
result_pf = pathfinder(logp, ∇logp; dim)
result_pf = pathfinder(∇P)
```
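
For orientation, here is an editorial sketch of how a result like `result_pf` typically feeds DynamicHMC's warmup; the exact call used later on this page is elided from the diff, and the keyword choices below are assumptions rather than the documented example.

```julia
using DynamicHMC, Random
result_dhmc = mcmc_with_warmup(
    Random.default_rng(), ∇P, 1_000;
    # start warmup from one of Pathfinder's draws
    initialization=(; q=result_pf.draws[:, 1]),
    reporter=NoProgressReport(),
)
```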

```@example 1
@@ -158,10 +156,10 @@ result_dhmc3 = mcmc_with_warmup(

## AdvancedHMC.jl

Similar to DynamicHMC, AdvancedHMC can work with an object implementing the LogDensityProblems API:
Similar to Pathfinder and DynamicHMC, AdvancedHMC can also work with a LogDensityProblem:

```@example 1
using AdvancedHMC, ForwardDiff
using AdvancedHMC
nadapts = 500;
nothing # hide
66 changes: 47 additions & 19 deletions docs/src/examples/quickstart.md
@@ -6,11 +6,28 @@ This page introduces basic Pathfinder usage with examples.

For a simple example, we'll run Pathfinder on a multivariate normal distribution with
a dense covariance matrix.
Pathfinder expects an object that implements the [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl) interface and has a gradient implemented.
We can use automatic differentiation to compute the gradient using [LogDensityProblemsAD](https://github.com/tpapp/LogDensityProblemsAD.jl).

```@example 1
using LinearAlgebra, Pathfinder, Printf, StatsPlots, Random
using ForwardDiff, LinearAlgebra, LogDensityProblems, LogDensityProblemsAD,
Pathfinder, Printf, StatsPlots, Random
Random.seed!(42)
struct MvNormalProblem{T,S}
    μ::T  # mean
    P::S  # precision matrix
end
function LogDensityProblems.capabilities(::Type{<:MvNormalProblem})
    return LogDensityProblems.LogDensityOrder{0}()
end
LogDensityProblems.dimension(ℓ::MvNormalProblem) = length(ℓ.μ)
function LogDensityProblems.logdensity(ℓ::MvNormalProblem, x)
    μ, P = ℓ.μ, ℓ.P  # use the problem's own mean and precision
    z = x - μ
    return -dot(z, P, z) / 2
end
Σ = [
2.71 0.5 0.19 0.07 1.04
0.5 1.11 -0.08 -0.17 -0.08
@@ -20,20 +37,15 @@ Random.seed!(42)
]
μ = [-0.55, 0.49, -0.76, 0.25, 0.94]
P = inv(Symmetric(Σ))
dim = length(μ)
init_scale=4
prob_mvnormal = ADgradient(:ForwardDiff, MvNormalProblem(μ, P))
function logp_mvnormal(x)
z = x - μ
return -dot(z, P, z) / 2
end
nothing # hide
```
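
A quick editorial check, not part of the committed example, that the wrapped problem exposes the gradient `pathfinder` now requires:

```julia
LogDensityProblems.capabilities(prob_mvnormal)  # at least LogDensityOrder{1}()
LogDensityProblems.logdensity_and_gradient(prob_mvnormal, zeros(length(μ)))
```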

Now we run [`pathfinder`](@ref).

```@example 1
result = pathfinder(logp_mvnormal; dim, init_scale)
result = pathfinder(prob_mvnormal; init_scale=4)
```

`result` is a [`PathfinderResult`](@ref).
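
As an editorial sanity check on the same example (not part of the diff), the fitted normal can be compared against the known target:

```julia
using Distributions
target = MvNormal(μ, Symmetric(Σ))
fit = result.fit_distribution  # Pathfinder's multivariate-normal approximation
maximum(abs, mean(fit) - mean(target)), maximum(abs, cov(fit) - cov(target))
```
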
@@ -114,14 +126,21 @@ Now we will run Pathfinder on the following banana-shaped distribution with density
\pi(x_1, x_2) = e^{-x_1^2 / 2} e^{-5 (x_2 - x_1^2)^2 / 2}.
```

First we define the distribution:
First we define the log density problem:

```@example 1
Random.seed!(23)
logp_banana(x) = -(x[1]^2 + 5(x[2] - x[1]^2)^2) / 2
dim = 2
init_scale = 10
struct BananaProblem end
function LogDensityProblems.capabilities(::Type{<:BananaProblem})
    return LogDensityProblems.LogDensityOrder{0}()
end
LogDensityProblems.dimension(ℓ::BananaProblem) = 2
function LogDensityProblems.logdensity(ℓ::BananaProblem, x)
    return -(x[1]^2 + 5(x[2] - x[1]^2)^2) / 2
end
prob_banana = ADgradient(:ForwardDiff, BananaProblem())
nothing # hide
```
@@ -131,13 +150,14 @@ and then visualise it:
```@example 1
xrange = -3.5:0.05:3.5
yrange = -3:0.05:7
logp_banana(x) = LogDensityProblems.logdensity(prob_banana, x)
contour(xrange, yrange, exp ∘ logp_banana ∘ Base.vect; xlabel="x₁", ylabel="x₂")
```

Now we run [`pathfinder`](@ref).

```@example 1
result = pathfinder(logp_banana; dim, init_scale)
result = pathfinder(prob_banana; init_scale=10)
```

As before we can visualise each iteration of the algorithm.
Expand All @@ -155,15 +175,15 @@ Especially for such complicated target distributions, it's always a good idea to

```@example 1
ndraws = 1_000
result = multipathfinder(logp_banana, ndraws; nruns=20, dim, init_scale)
result = multipathfinder(prob_banana, ndraws; nruns=20, init_scale=10)
```

`result` is a [`MultiPathfinderResult`](@ref).
See its docstring for a description of its fields.

`result.fit_distribution` is a uniformly-weighted `Distributions.MixtureModel`.
Each component is the result of a single Pathfinder run.
It's possible that some runs fit the target distribution much better than others, so instead of just drawing samples from `result.fit_distribution`, `multipathfinder` draws many samples from each component and then uses Pareto-smoothed importance resampling (using [PSIS.jl](https://psis.julia.arviz.org/stable/)) from these draws to better target `logp_banana`.
It's possible that some runs fit the target distribution much better than others, so instead of just drawing samples from `result.fit_distribution`, `multipathfinder` draws many samples from each component and then uses Pareto-smoothed importance resampling (using [PSIS.jl](https://psis.julia.arviz.org/stable/)) from these draws to better target `prob_banana`.

The Pareto shape diagnostic informs us on the quality of these draws.
Here the Pareto shape ``k`` diagnostic is bad (``k > 0.7``), which tells us that these draws are unsuitable for computing posterior estimates, so we should definitely run MCMC to get better draws.
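
An editorial sketch of checking that diagnostic programmatically; the field names follow the `MultiPathfinderResult` and PSIS.jl docstrings and should be treated as assumptions here:

```julia
pareto_shape = result.psis_result.pareto_shape  # Pareto shape k̂ from the resampling step
pareto_shape > 0.7 &&
    @warn "High Pareto shape: treat these draws as rough initial values, not posterior draws."
```
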
@@ -207,15 +227,23 @@ function logp_funnel(x)
return ((τ / 3)^2 + (n - 1) * τ + sum(b -> abs2(b * exp(-τ / 2)), β)) / -2
end
dim = 100
init_scale = 10
struct FunnelProblem
    dim::Int
end
function LogDensityProblems.capabilities(::Type{<:FunnelProblem})
    return LogDensityProblems.LogDensityOrder{0}()
end
LogDensityProblems.dimension(ℓ::FunnelProblem) = ℓ.dim
LogDensityProblems.logdensity(::FunnelProblem, x) = logp_funnel(x)
prob_funnel = ADgradient(:ForwardDiff, FunnelProblem(100))
nothing # hide
```

First, let's fit this posterior with single-path Pathfinder.

```@example 1
result_single = pathfinder(logp_funnel; dim, init_scale)
result_single = pathfinder(prob_funnel; init_scale=10)
```

Let's visualize this sequence of multivariate normals for the first two dimensions.
@@ -240,7 +268,7 @@ Now we run [`multipathfinder`](@ref).

```@example 1
ndraws = 1_000
result = multipathfinder(logp_funnel, ndraws; nruns=20, dim, init_scale)
result = multipathfinder(prob_funnel, ndraws; nruns=20, init_scale=10)
```

Again, the poor Pareto shape diagnostic indicates we should run MCMC to get draws suitable for computing posterior estimates.
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -42,7 +42,7 @@ Pathfinder uses several packages for extended functionality:
- [Optimization.jl](https://optimization.sciml.ai/stable/): This allows the L-BFGS optimizer to be replaced with any of the many Optimization-compatible optimizers and supports use of callbacks. Note that any changes made to Pathfinder using these features would be experimental.
- [Transducers.jl](https://juliafolds.github.io/Transducers.jl/stable/): parallelization support
- [Distributions.jl](https://juliastats.org/Distributions.jl/stable/)/[PDMats.jl](https://github.com/JuliaStats/PDMats.jl): fits can be used anywhere a `Distribution` can be used
- [AbstractDifferentiation.jl](https://github.com/JuliaDiff/AbstractDifferentiation.jl): selecting the AD package used to differentiate the provided log-density function.
- [LogDensityProblems.jl](https://www.tamaspapp.eu/LogDensityProblems.jl/stable/): defining the log-density function, gradient, and Hessian
- [ProgressLogging.jl](https://julialogging.github.io/ProgressLogging.jl/stable/): In Pluto, Juno, and VSCode, nested progress bars are shown. In the REPL, use TerminalLoggers.jl to get progress bars.

[^Zhang2021]: Lu Zhang, Bob Carpenter, Andrew Gelman, Aki Vehtari (2021).
2 changes: 1 addition & 1 deletion src/Pathfinder.jl
@@ -1,12 +1,12 @@
module Pathfinder

using AbstractDifferentiation: AD
using Distributions: Distributions
using Folds: Folds
# ensure that ForwardDiff is conditionally loaded by Optimization
using ForwardDiff: ForwardDiff
using IrrationalConstants: log2π
using LinearAlgebra
using LogDensityProblems: LogDensityProblems
using Optim: Optim, LineSearches
using Optimization: Optimization
using OptimizationOptimJL: OptimizationOptimJL
48 changes: 17 additions & 31 deletions src/multipath.jl
@@ -67,8 +67,7 @@ function Base.show(io::IO, ::MIME"text/plain", result::MultiPathfinderResult)
end

"""
multipathfinder(logp, ndraws; kwargs...)
multipathfinder(logp, ∇logp, ndraws; kwargs...)
multipathfinder(ℓ, ndraws; kwargs...)
multipathfinder(fun::SciMLBase.OptimizationFunction, ndraws; kwargs...)
Run [`pathfinder`](@ref) multiple times to fit a multivariate normal mixture model.
Expand All @@ -91,31 +90,29 @@ resulting draws better approximate draws from the target distribution ``p`` inst
for approximating expectations with respect to ``p``.
# Arguments
- `logp`: a callable that computes the log-density of the target distribution.
- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
automatically differentiated using the backend specified in `ad_backend`.
- `ℓ`: an object, representing the log-density of the target distribution and its gradient,
  that implements the [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl)
  interface.
- `fun::SciMLBase.OptimizationFunction`: an optimization function that represents
`-logp(x)` with its gradient. It must have the necessary features (e.g. a Hessian
function) for the chosen optimization algorithm. For details, see
a negative log density with its gradient. It must have the necessary features (e.g. a
Hessian function) for the chosen optimization algorithm. For details, see
[Optimization.jl: OptimizationFunction](https://optimization.sciml.ai/stable/API/optimization_function/).
- `ndraws::Int`: number of approximate draws to return
# Keywords
- `init`: iterator of length `nruns` of initial points of length `dim` from which each
single-path Pathfinder run will begin. `length(init)` must be implemented. If `init` is
not provided, `dim` and `nruns` must be.
not provided, `nruns` must be, and `dim` must be if `fun` is provided.
- `nruns::Int`: number of runs of Pathfinder to perform. Ignored if `init` is provided.
- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend used to
differentiate `logp` if `∇logp` is not provided.
- `ndraws_per_run::Int`: The number of draws to take for each component before resampling.
Defaults to a number such that `ndraws_per_run * nruns > ndraws`.
- `importance::Bool=true`: Perform Pareto smoothed importance resampling of draws.
- `rng::AbstractRNG=Random.GLOBAL_RNG`: Pseudorandom number generator. It is recommended to
use a parallelization-friendly PRNG like the default PRNG on Julia 1.7 and up.
- `executor::Transducers.Executor`: Transducers.jl executor that determines if and how
to run the single-path runs in parallel. If `rng` is known to be thread-safe, the
default is `Transducers.PreferParallel()` (parallel executation, defaulting to
multi-threading). Otherwise, it is `Transducers.SequentialEx()` (no parallelization).
- `executor::Transducers.Executor=Transducers.SequentialEx()`: Transducers.jl executor that
determines if and how to run the single-path runs in parallel. If a transducer for
multi-threaded computation is selected, you must first verify that `rng` and the log
density function are thread-safe.
- `executor_per_run::Transducers.Executor=Transducers.SequentialEx()`: Transducers.jl
executor used within each run to parallelize PRNG calls. Defaults to no parallelization.
See [`pathfinder`](@ref) for a description.
@@ -126,22 +123,11 @@ for approximating expectations with respect to ``p``.
"""
function multipathfinder end

function multipathfinder(
logp, ndraws::Int; ad_backend=AD.ForwardDiffBackend(), input=logp, kwargs...
)
return multipathfinder(build_optim_function(logp; ad_backend), ndraws; input, kwargs...)
end
function multipathfinder(
logp,
∇logp,
ndraws::Int;
ad_backend=AD.ForwardDiffBackend(),
input=(logp, ∇logp),
kwargs...,
)
return multipathfinder(
build_optim_function(logp, ∇logp; ad_backend), ndraws; input, kwargs...
)
function multipathfinder(ℓ, ndraws::Int; input=ℓ, kwargs...)
    _check_log_density_problem(ℓ)
    dim = LogDensityProblems.dimension(ℓ)
    optim_fun = build_optim_function(ℓ)
    return multipathfinder(optim_fun, ndraws; input, dim, kwargs...)
end
function multipathfinder(
optim_fun::SciMLBase.OptimizationFunction,
@@ -154,7 +140,7 @@ function multipathfinder(
rng::Random.AbstractRNG=Random.GLOBAL_RNG,
history_length::Int=DEFAULT_HISTORY_LENGTH,
optimizer=default_optimizer(history_length),
executor::Transducers.Executor=_default_executor(rng),
executor::Transducers.Executor=Transducers.SequentialEx(),
executor_per_run=Transducers.SequentialEx(),
importance::Bool=true,
kwargs...,
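
An editorial sketch of the opt-in parallelism implied by the `executor` change above; `∇P` is a placeholder LogDensityProblems object, and running in parallel assumes the log density and RNG are thread-safe.

```julia
using Pathfinder, Random, Transducers
result = multipathfinder(
    ∇P, 1_000;
    nruns=20,
    rng=Random.default_rng(),               # task-safe default RNG on Julia ≥ 1.7
    executor=Transducers.PreferParallel(),  # parallel single-path runs are now opt-in
)
```
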
22 changes: 5 additions & 17 deletions src/optimize.jl
@@ -1,28 +1,16 @@
function build_optim_function(f; ad_backend=AD.ForwardDiffBackend())
∇f(x) = only(AD.gradient(ad_backend, f, x))
return build_optim_function(f, ∇f; ad_backend)
end
function build_optim_function(f, ∇f; ad_backend=AD.ForwardDiffBackend())
# because we need explicit access to grad, we generate these ourselves instead of using
# Optimization.jl's auto-AD feature.
# TODO: switch to caching API if available, see
# https://github.com/JuliaDiff/AbstractDifferentiation.jl/issues/41
function build_optim_function(ℓ)
f(x, p) = -LogDensityProblems.logdensity(ℓ, x)
function grad(res, x, p)
∇fx = ∇f(x)
_, ∇fx = LogDensityProblems.logdensity_and_gradient(ℓ, x)
@. res = -∇fx
return res
end
function hess(res, x, p)
H = only(AD.hessian(ad_backend, f, x))
_, _, H = LogDensityProblems.logdensity_gradient_and_hessian(ℓ, x)
@. res = -H
return res
end
function hv(res, x, v, p)
Hv = only(AD.lazy_hessian(ad_backend, f, x) * v)
@. res = -Hv
return res
end
return SciMLBase.OptimizationFunction{true}((x, p) -> -f(x); grad, hess, hv)
return SciMLBase.OptimizationFunction{true}(f; grad, hess)
end

function build_optim_problem(optim_fun, x₀)
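
A hedged sketch of how these internal helpers fit together; they are non-exported internals, `∇P` is a placeholder problem with a gradient, and the solver call is an assumption rather than what Pathfinder does verbatim.

```julia
using LogDensityProblems, Optim, Optimization, OptimizationOptimJL, Pathfinder

optim_fun = Pathfinder.build_optim_function(∇P)      # wraps -logdensity and its gradient
x₀ = randn(LogDensityProblems.dimension(∇P))
optim_prob = Pathfinder.build_optim_problem(optim_fun, x₀)
sol = Optimization.solve(optim_prob, Optim.LBFGS())  # minimize the negative log density
```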

2 comments on commit b5d1917

@sethaxen
Member Author

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/76804

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

```
git tag -a v0.7.0 -m "<description of version>" b5d19179974d023661afb481b4057c7806b9001d
git push origin v0.7.0
```
