diff --git a/Project.toml b/Project.toml
index cf6dcb55..66bec83c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen and contributors"]
-version = "0.9.6"
+version = "0.9.7"

 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/docs/Project.toml b/docs/Project.toml
index 91a8f13c..194eb4de 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,6 +2,8 @@
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 AdvancedHMC = "0bf59076-c3b1-5ca4-86bd-e02cd72cde3d"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
+DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656"
 DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -20,6 +22,8 @@ Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
 ADTypes = "0.2, 1"
 AdvancedHMC = "0.6"
 Documenter = "1"
+DocumenterCitations = "1.2"
+DocumenterInterLinks = "1"
 DynamicHMC = "3.4.0"
 ForwardDiff = "0.10.19"
 LogDensityProblems = "2.1.0"
diff --git a/docs/inventories/Distributions.toml b/docs/inventories/Distributions.toml
new file mode 100644
index 00000000..29d7e67b
--- /dev/null
+++ b/docs/inventories/Distributions.toml
@@ -0,0 +1,11 @@
+# DocInventory version 1
+project = "Distributions.jl"
+version = "0.25.113"
+
+# Filtered to just the types we link to
+[[jl.type]]
+name = "Distributions.MixtureModel"
+uri = "mixture/#$"
+[[jl.type]]
+name = "Distributions.MvNormal"
+uri = "multivariate/#$"
diff --git a/docs/inventories/DynamicHMC.toml b/docs/inventories/DynamicHMC.toml
new file mode 100644
index 00000000..74930146
--- /dev/null
+++ b/docs/inventories/DynamicHMC.toml
@@ -0,0 +1,13 @@
+# DocInventory version 1
+project = "DynamicHMC.jl"
+version = "3.4.7"
+
+# Filtered to just the items we link to
+[[std.doc]]
+dispname = "A worked example"
+name = "worked_example"
+uri = "worked_example/"
+
+[[jl.type]]
+name = "DynamicHMC.GaussianKineticEnergy"
+uri = "interface/#$"
diff --git a/docs/inventories/Transducers.toml b/docs/inventories/Transducers.toml
new file mode 100644
index 00000000..0411804b
--- /dev/null
+++ b/docs/inventories/Transducers.toml
@@ -0,0 +1,11 @@
+# DocInventory version 1
+project = "Transducers.jl"
+version = "0.4.84"
+
+# Filtered to just the types we link to
+[[jl.type]]
+name = "Transducers.PreferParallel"
+uri = "reference/manual/#$"
+[[jl.type]]
+name = "Transducers.SequentialEx"
+uri = "reference/manual/#$"
diff --git a/docs/make.jl b/docs/make.jl
index 8f3bc6ba..35f69aae 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,8 +1,41 @@
 using Pathfinder
 using Documenter
+using DocumenterCitations
+using DocumenterInterLinks

 DocMeta.setdocmeta!(Pathfinder, :DocTestSetup, :(using Pathfinder); recursive=true)

+bib = CitationBibliography(joinpath(@__DIR__, "src", "references.bib"); style=:numeric)
+
+links = InterLinks(
+    "AdvancedHMC" => "https://turinglang.org/AdvancedHMC.jl/stable/",
+    "ADTypes" => "https://sciml.github.io/ADTypes.jl/stable/",
+    "Distributions" => (
+        "https://juliastats.org/Distributions.jl/stable/",
+        "https://juliastats.org/Distributions.jl/dev/objects.inv",
+        joinpath(@__DIR__, "inventories", "Distributions.toml"),
+    ),
+    "DynamicHMC" => (
+        "https://www.tamaspapp.eu/DynamicHMC.jl/stable/",
+        "https://www.tamaspapp.eu/DynamicHMC.jl/dev/objects.inv",
+        joinpath(@__DIR__, "inventories", "DynamicHMC.toml"),
+    ),
+    "DynamicPPL" => "https://turinglang.org/DynamicPPL.jl/stable/",
+    "LogDensityProblems" => "https://www.tamaspapp.eu/LogDensityProblems.jl/stable/",
+    "MCMCChains" => (
+        "https://turinglang.org/MCMCChains.jl/stable/",
+        "https://turinglang.org/MCMCChains.jl/dev/objects.inv",
+    ),
+    "Optim" => "https://julianlsolvers.github.io/Optim.jl/stable/",
+    "Optimization" => "https://docs.sciml.ai/Optimization/stable/",
+    "PSIS" => "https://julia.arviz.org/PSIS/stable/",
+    "Transducers" => (
+        "https://juliafolds2.github.io/Transducers.jl/stable/",  # not built for a while
+        "https://juliafolds2.github.io/Transducers.jl/dev/objects.inv",
+        joinpath(@__DIR__, "inventories", "Transducers.toml"),
+    ),
+)
+
 makedocs(;
     modules=[Pathfinder],
     authors="Seth Axen and contributors",
@@ -11,7 +44,7 @@ makedocs(;
     format=Documenter.HTML(;
         prettyurls=get(ENV, "CI", "false") == "true",
         canonical="https://mlcolab.github.io/Pathfinder.jl",
-        assets=String[],
+        assets=String["assets/citations.css"],
     ),
     pages=[
         "Home" => "index.md",
@@ -21,7 +54,9 @@ makedocs(;
             "Initializing HMC" => "examples/initializing-hmc.md",
             "Turing usage" => "examples/turing.md",
         ],
+        "References" => "references.md",
     ],
+    plugins=[bib, links],
 )

 if get(ENV, "DEPLOY_DOCS", "true") == "true"
diff --git a/docs/src/assets/citations.css b/docs/src/assets/citations.css
new file mode 100644
index 00000000..db2f1c53
--- /dev/null
+++ b/docs/src/assets/citations.css
@@ -0,0 +1,29 @@
+/* Adapted from DocumenterCitations.jl's docs */
+.citation dl {
+    display: grid;
+    grid-template-columns: max-content auto;
+}
+
+.citation dt {
+    grid-column-start: 1;
+}
+
+.citation dd {
+    grid-column-start: 2;
+    margin-bottom: 0.75em;
+}
+
+.citation ul {
+    padding: 0 0 2.25em 0;
+    margin: 0;
+    list-style: none !important;
+}
+
+.citation ul li {
+    text-indent: -2.25em;
+    margin: 0.33em 0.5em 0.5em 2.25em;
+}
+
+.citation ol li {
+    padding-left: 0.75em;
+}
\ No newline at end of file
diff --git a/docs/src/examples/initializing-hmc.md b/docs/src/examples/initializing-hmc.md
index 0777ed93..443c0515 100644
--- a/docs/src/examples/initializing-hmc.md
+++ b/docs/src/examples/initializing-hmc.md
@@ -7,7 +7,7 @@ When using MCMC to draw samples from some target distribution, there is often a
 2. adapt any tunable parameters of the MCMC sampler (optional)

 While (1) often happens fairly quickly, (2) usually requires a lengthy exploration of the typical set to iteratively adapt parameters suitable for further exploration.
-An example is the widely used windowed adaptation scheme of Hamiltonian Monte Carlo (HMC) in Stan, where a step size and positive definite metric (aka mass matrix) are adapted.[^1]
+An example is the widely used windowed adaptation scheme of Hamiltonian Monte Carlo (HMC) in Stan [StanHMCParameters](@citep), where a step size and positive definite metric (aka mass matrix) are adapted.
 For posteriors with complex geometry, the adaptation phase can require many evaluations of the gradient of the log density function of the target distribution.

 Pathfinder can be used to initialize MCMC, and in particular HMC, in 3 ways:
@@ -82,7 +82,7 @@ nothing # hide

 ## DynamicHMC.jl

-To use DynamicHMC, we first need to transform our model to an unconstrained space using [TransformVariables.jl](https://tamaspapp.eu/TransformVariables.jl/stable/) and wrap it in a type that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface:
+To use DynamicHMC, we first need to transform our model to an unconstrained space using [TransformVariables.jl](https://tamaspapp.eu/TransformVariables.jl/stable/) and wrap it in a type that implements the [LogDensityProblems interface](@extref LogDensityProblems log-density-api) (see [DynamicHMC's worked example](@extref DynamicHMC worked_example)):

 ```@example 1
 using DynamicHMC, ForwardDiff, LogDensityProblems, LogDensityProblemsAD, TransformVariables
@@ -123,7 +123,7 @@ result_dhmc1 = mcmc_with_warmup(

 ### Initializing metric adaptation from Pathfinder's estimate

-To start with Pathfinder's inverse metric estimate, we just need to initialize a `GaussianKineticEnergy` object with it as input:
+To start with Pathfinder's inverse metric estimate, we just need to initialize a [`DynamicHMC.GaussianKineticEnergy`](@extref) object with it as input:

 ```@example 1
 result_dhmc2 = mcmc_with_warmup(
@@ -212,7 +212,7 @@ samples_ahmc2, stats_ahmc2 = sample(

 ### Use Pathfinder's metric estimate for sampling

-To use Pathfinder's metric with no metric adaptation, we need to use Pathfinder's own `RankUpdateEuclideanMetric` type, which just wraps our inverse metric estimate for use with AdvancedHMC:
+To use Pathfinder's metric with no metric adaptation, we need to use Pathfinder's own [`Pathfinder.RankUpdateEuclideanMetric`](@ref) type, which just wraps our inverse metric estimate for use with AdvancedHMC:

 ```@example 1
 nadapts = 75
@@ -233,5 +233,3 @@ samples_ahmc3, stats_ahmc3 = sample(
     progress=false,
 )
 ```
-
-[^1]: https://mc-stan.org/docs/reference-manual/hmc-algorithm-parameters.html
diff --git a/docs/src/examples/quickstart.md b/docs/src/examples/quickstart.md
index fa436c1e..caf0c18f 100644
--- a/docs/src/examples/quickstart.md
+++ b/docs/src/examples/quickstart.md
@@ -121,7 +121,7 @@ Now we will run Pathfinder on the following banana-shaped distribution with density
 \pi(x_1, x_2) = e^{-x_1^2 / 2} e^{-5 (x_2 - x_1^2)^2 / 2}.
 ```

-Pathfinder can also take any object that implements the [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl) interface.
+Pathfinder can also take any object that implements the [LogDensityProblems interface](@extref LogDensityProblems log-density-api).
 This can also be used to manually define the gradient of the log-density function.

 First we define the log density problem:
@@ -185,9 +185,9 @@ result = multipathfinder(prob_banana, ndraws; nruns=20, init_scale=10)
 `result` is a [`MultiPathfinderResult`](@ref).
 See its docstring for a description of its fields.

-`result.fit_distribution` is a uniformly-weighted `Distributions.MixtureModel`.
+`result.fit_distribution` is a uniformly-weighted [`Distributions.MixtureModel`](@extref).
 Each component is the result of a single Pathfinder run.
-It's possible that some runs fit the target distribution much better than others, so instead of just drawing samples from `result.fit_distribution`, `multipathfinder` draws many samples from each component and then uses Pareto-smoothed importance resampling (using [PSIS.jl](https://psis.julia.arviz.org/stable/)) from these draws to better target `prob_banana`.
+It's possible that some runs fit the target distribution much better than others, so instead of just drawing samples from `result.fit_distribution`, `multipathfinder` draws many samples from each component and then uses Pareto-smoothed importance resampling (using [PSIS.jl](@extref PSIS PSIS)) from these draws to better target `prob_banana`.

 The Pareto shape diagnostic informs us on the quality of these draws.
 Here the Pareto shape ``k`` diagnostic is bad (``k > 0.7``), which tells us that these draws are unsuitable for computing posterior estimates, so we should definitely run MCMC to get better draws.
@@ -238,7 +238,7 @@ nothing # hide

 First, let's fit this posterior with single-path Pathfinder.
 For high-dimensional problems, it's better to use reverse-mode automatic differentiation.
-Here, we'll use `ADTypes.AutoReverseDiff()` to specify that [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) should be used.
+Here, we'll use [`ADTypes.AutoReverseDiff`](@extref) to specify that [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) should be used.

 ```@example 1

diff --git a/docs/src/examples/turing.md b/docs/src/examples/turing.md
index e2774e93..7d26f304 100644
--- a/docs/src/examples/turing.md
+++ b/docs/src/examples/turing.md
@@ -24,7 +24,7 @@ model = regress(collect(x), y)
 n_chains = 8
 ```

-For convenience, [`pathfinder`](@ref) and [`multipathfinder`](@ref) can take Turing models as inputs and produce `MCMCChains.Chains` objects as outputs.
+For convenience, [`pathfinder`](@ref) and [`multipathfinder`](@ref) can take Turing models as inputs and produce [`MCMCChains.Chains`](@extref) objects as outputs.

 ```@example 1
 result_single = pathfinder(model; ndraws=1_000)
@@ -36,7 +36,7 @@ result_multi = multipathfinder(model, 1_000; nruns=n_chains)

 Here, the Pareto shape diagnostic indicates that it is likely safe to use these draws to compute posterior estimates.

-When passed a `Model`, Pathfinder also gives access to the posterior draws in a familiar `MCMCChains.Chains` object.
+When passed a [`DynamicPPL.Model`](@extref), Pathfinder also gives access to the posterior draws in a familiar `Chains` object.

 ```@example 1
 result_multi.draws_transformed
diff --git a/docs/src/index.md b/docs/src/index.md
index c2469b9e..2f51db66 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -4,7 +4,7 @@ CurrentModule = Pathfinder

 # Pathfinder.jl: Parallel quasi-Newton variational inference

-This package implements Pathfinder, [^Zhang2021] a variational method for initializing Markov chain Monte Carlo (MCMC) methods.
+This package implements Pathfinder [ZhangPathfinder2021](@citep), a variational method for initializing Markov chain Monte Carlo (MCMC) methods.

 ## Single-path Pathfinder

@@ -45,8 +45,3 @@ Pathfinder uses several packages for extended functionality:
 - [Distributions.jl](https://juliastats.org/Distributions.jl/stable/)/[PDMats.jl](https://github.com/JuliaStats/PDMats.jl): fits can be used anywhere a `Distribution` can be used
 - [LogDensityProblems.jl](https://www.tamaspapp.eu/LogDensityProblems.jl/stable/): defining the log-density function, gradient, and Hessian
 - [ProgressLogging.jl](https://julialogging.github.io/ProgressLogging.jl/stable/): In Pluto, Juno, and VSCode, nested progress bars are shown. In the REPL, use TerminalLoggers.jl to get progress bars.
-
-[^Zhang2021]: Lu Zhang, Bob Carpenter, Andrew Gelman, Aki Vehtari (2021).
-    Pathfinder: Parallel quasi-Newton variational inference.
-    arXiv: [2108.03782](https://arxiv.org/abs/2108.03782) [stat.ML].
-    [Code](https://github.com/LuZhangstat/Pathfinder)
diff --git a/docs/src/references.bib b/docs/src/references.bib
new file mode 100644
index 00000000..8c011ef8
--- /dev/null
+++ b/docs/src/references.bib
@@ -0,0 +1,34 @@
+@article{Byrd1994,
+  title = {Representations of Quasi-{{Newton}} Matrices and Their Use in Limited Memory Methods},
+  author = {Byrd, Richard H. and Nocedal, Jorge and Schnabel, Robert B.},
+  year = {1994},
+  month = jan,
+  journal = {Mathematical Programming},
+  volume = {63},
+  number = {1-3},
+  pages = {129--156},
+  issn = {0025-5610, 1436-4646},
+  doi = {10.1007/BF01582063}
+}
+
+@misc{StanHMCParameters,
+  title = {Stan Reference Manual: {HMC} algorithm parameters},
+  urldate = {2024-12-06},
+  url = {https://mc-stan.org/docs/reference-manual/mcmc.html#hmc-algorithm-parameters}
+}
+
+@article{ZhangPathfinder2021,
+  title = {Pathfinder: {{Parallel}} Quasi-{{Newton}} Variational Inference},
+  shorttitle = {Pathfinder},
+  author = {Zhang, Lu and Carpenter, Bob and Gelman, Andrew and Vehtari, Aki},
+  year = {2022},
+  journal = {Journal of Machine Learning Research},
+  volume = {23},
+  number = {306},
+  pages = {1--49},
+  issn = {1533-7928},
+  urldate = {2024-12-06},
+  url = {http://jmlr.org/papers/v23/21-0889.html},
+  eprint = {2108.03782},
+  eprinttype = {arXiv}
+}
diff --git a/docs/src/references.md b/docs/src/references.md
new file mode 100644
index 00000000..78f29bc4
--- /dev/null
+++ b/docs/src/references.md
@@ -0,0 +1,4 @@
+# References
+
+```@bibliography
+```
diff --git a/ext/PathfinderTuringExt.jl b/ext/PathfinderTuringExt.jl
index d959f5ec..0dcbfd3c 100644
--- a/ext/PathfinderTuringExt.jl
+++ b/ext/PathfinderTuringExt.jl
@@ -34,8 +34,9 @@ end
 """
     draws_to_chains(model::DynamicPPL.Model, draws) -> MCMCChains.Chains

-Convert a `(nparams, ndraws)` matrix of unconstrained `draws` to an `MCMCChains.Chains`
-object with corresponding constrained draws and names according to `model`.
+Convert a `(nparams, ndraws)` matrix of unconstrained `draws` to an
+[`MCMCChains.Chains`](@extref) object with corresponding constrained draws and names
+according to `model`.
 """
 function draws_to_chains(model::DynamicPPL.Model, draws::AbstractMatrix)
     varinfo = DynamicPPL.link(DynamicPPL.VarInfo(model), model)
diff --git a/src/integration/advancedhmc.jl b/src/integration/advancedhmc.jl
index 08280b27..a8ee69ea 100644
--- a/src/integration/advancedhmc.jl
+++ b/src/integration/advancedhmc.jl
@@ -4,7 +4,8 @@ using .Random
 """
     RankUpdateEuclideanMetric{T,M} <: AdvancedHMC.AbstractMetric

-A Gaussian Euclidean metric whose inverse is constructed by rank-updates.
+A Gaussian Euclidean [metric](@extref AdvancedHMC Hamiltonian-mass-matrix-(metric)) whose
+inverse is constructed by rank-updates.

 # Constructors

diff --git a/src/inverse_hessian.jl b/src/inverse_hessian.jl
index 273e5948..7385fe22 100644
--- a/src/inverse_hessian.jl
+++ b/src/inverse_hessian.jl
@@ -73,10 +73,10 @@ Compute approximate inverse Hessian initialized from `H₀` from history stored
 `history_ind` indicates the column in `S₀` and `Y₀` that was most recently added to the
 history, while `history_length` indicates the number of first columns in `S₀` and `Y₀`
 currently being used for storing history.
-`S = S₀[:, history_ind+1:history_length; 1:history_ind]` reorders the columns of `₀` so that the
-oldest is first and newest is last.
+`S = S₀[:, history_ind+1:history_length; 1:history_ind]` reorders the columns of `S₀` so
+that the oldest is first and newest is last.

-From Theorem 2.2 of [^Byrd1994], the expression for the inverse Hessian ``H`` is
+From [Byrd1994; Theorem 2.2](@citet), the expression for the inverse Hessian ``H`` is

 ```math
 \\begin{align}
@@ -91,10 +91,9 @@ H &= H_0 + B D B^\\mathrm{T}
 \\end{align}
 ```

-[^Byrd1994]: Byrd, R.H., Nocedal, J. & Schnabel, R.B.
-    Representations of quasi-Newton matrices and their use in limited memory methods.
-    Mathematical Programming 63, 129–156 (1994).
-    doi: [10.1007/BF01582063](https://doi.org/10.1007/BF01582063)
+# References
+
+- [Byrd1994](@cite): Byrd et al. Math. Program. 63, 1994.
 """
 function lbfgs_inverse_hessian(H₀::Diagonal, S0, Y0, history_ind, history_length)
     J = history_length
diff --git a/src/multipath.jl b/src/multipath.jl
index 58a98b0e..8f1fcc22 100644
--- a/src/multipath.jl
+++ b/src/multipath.jl
@@ -8,11 +8,11 @@ Container for results of multi-path Pathfinder.
     `optim_prob`, or another object.
 - `optimizer`: Optimizer used for maximizing the log-density
 - `rng`: Pseudorandom number generator that was used for sampling
-- `optim_prob::SciMLBase.OptimizationProblem`: Otimization problem used for
+- `optim_prob::`[`SciMLBase.OptimizationProblem`](@extref): Optimization problem used for
     optimization
 - `logp`: Log-density function
-- `fit_distribution::Distributions.MixtureModel`: uniformly-weighted mixture of ELBO-
-    maximizing multivariate normal distributions from each run.
+- `fit_distribution::`[`Distributions.MixtureModel`](@extref): uniformly-weighted mixture of
+    ELBO-maximizing multivariate normal distributions from each run.
 - `draws::AbstractMatrix{<:Real}`: draws from `fit_distribution` with size `(dim, ndraws)`,
     potentially resampled using importance resampling to be closer to the target
     distribution.
@@ -21,13 +21,12 @@ Container for results of multi-path Pathfinder.
     user-supplied target distribution. This is only different from `fit_distribution` when
     integrating with other packages, and its type depends on the type of `input`.
 - `draws_transformed`: `draws` transformed to be draws from `fit_distribution_transformed`.
-- `pathfinder_results::Vector{<:PathfinderResult}`: results of each single-path Pathfinder
-    run.
-- `psis_result::Union{Nothing,<:PSIS.PSISResult}`: If importance resampling was used, the
-    result of Pareto-smoothed importance resampling. `psis_result.pareto_shape` also
-    diagnoses whether `draws` can be used to compute estimates from the target distribution.
-    See [`PSIS.PSISResult`](https://psis.julia.arviz.org/stable/api/#PSIS.PSISResult) for
-    details
+- `pathfinder_results::Vector{<:`[`PathfinderResult`](@ref)`}`: results of each single-path
+    Pathfinder run.
+- `psis_result::Union{Nothing,<:`[`PSIS.PSISResult`](@extref)`}`: If importance resampling
+    was used, the result of Pareto-smoothed importance resampling.
+    `psis_result.pareto_shape` also diagnoses whether `draws` can be used to compute
+    estimates from the target distribution.
 """
 struct MultiPathfinderResult{I,O,R,OF,LP,FD,D,C,FDT,DT,PFR,PR}
     input::I
@@ -102,13 +101,15 @@ $(_ARGUMENT_DOCSTRING)
 - `importance::Bool=true`: Perform Pareto smoothed importance resampling of draws.
 - `rng::AbstractRNG=Random.GLOBAL_RNG`: Pseudorandom number generator. It is recommended to
     use a parallelization-friendly PRNG like the default PRNG on Julia 1.7 and up.
-- `executor::Transducers.Executor=Transducers.SequentialEx()`: Transducers.jl executor that
-    determines if and how to run the single-path runs in parallel. If a transducer for
+- `executor::Transducers.Executor`: Transducers.jl executor that determines if and how to
+    run the single-path runs in parallel, defaulting to
+    [`Transducers.SequentialEx()`](@extref `Transducers.SequentialEx`). If a transducer for
     multi-threaded computation is selected, you must first verify that `rng` and the log
     density function are thread-safe.
-- `executor_per_run::Transducers.Executor=Transducers.SequentialEx()`: Transducers.jl
-    executor used within each run to parallelize PRNG calls. Defaults to no parallelization.
-    See [`pathfinder`](@ref) for a description.
+- `executor_per_run::Transducers.Executor`: Transducers.jl executor used within each run to
+    parallelize PRNG calls, defaulting to
+    [`Transducers.SequentialEx()`](@extref `Transducers.SequentialEx`). See
+    [`pathfinder`](@ref) for further description.
 - `kwargs...` : Remaining keywords are forwarded to [`pathfinder`](@ref).

 # Returns
diff --git a/src/singlepath.jl b/src/singlepath.jl
index c126af10..a8bb7f27 100644
--- a/src/singlepath.jl
+++ b/src/singlepath.jl
@@ -4,15 +4,15 @@ const _ARGUMENT_DOCSTRING = """
     - a callable with the signature
         `f(params::AbstractVector{<:Real}) -> log_density::Real`.
     - an object implementing the
-        [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl) interface.
-    - `SciMLBase.OptimizationFunction`: wraps the *negative* log density. It must have the
-        necessary features (e.g. a gradient or Hessian function) for the chosen `optimizer`.
-        For details, see
-        [Optimization.jl: OptimizationFunction](https://optimization.sciml.ai/stable/API/optimization_function/).
-    - `SciMLBase.OptimizationProblem`: an optimization problem containing a function with
-        the same properties as the above `OptimizationFunction`, as well as an initial
-        point. If provided, `init` and `dim` are ignored.
-    - `DynamicPPL.Model`: a Turing model. If provided, `init` and `dim` are ignored.
+        [LogDensityProblems interface](@extref LogDensityProblems log-density-api).
+    - [`SciMLBase.OptimizationFunction`](@extref): wraps the *negative* log density. It must
+        have the necessary features (e.g. a gradient or Hessian function) for the chosen
+        `optimizer`.
+    - [`SciMLBase.OptimizationProblem`](@extref): an optimization problem containing a
+        function with the same properties as the above `OptimizationFunction`, as well as an
+        initial point. If provided, `init` and `dim` are ignored.
+    - [`DynamicPPL.Model`](@extref): a Turing model. If provided, `init` and `dim` are
+        ignored.
 """

 """
@@ -25,11 +25,11 @@ Container for results of single-path Pathfinder.
     or another object.
 - `optimizer`: Optimizer used for maximizing the log-density
 - `rng`: Pseudorandom number generator that was used for sampling
-- `optim_prob::SciMLBase.OptimizationProblem`: Otimization problem used for
+- `optim_prob::`[`SciMLBase.OptimizationProblem`](@extref): Optimization problem used for
     optimization
 - `logp`: Log-density function
-- `fit_distribution::Distributions.MvNormal`: ELBO-maximizing multivariate normal
-    distribution
+- `fit_distribution::`[`Distributions.MvNormal`](@extref): ELBO-maximizing multivariate
+    normal distribution
 - `draws::AbstractMatrix{<:Real}`: draws from multivariate normal with size `(dim, ndraws)`
 - `fit_distribution_transformed`: `fit_distribution` transformed to the same space as the
     user-supplied target distribution. This is only different from `fit_distribution` when
@@ -37,7 +37,8 @@ Container for results of single-path Pathfinder.
 - `draws_transformed`: `draws` transformed to be draws from `fit_distribution_transformed`.
 - `fit_iteration::Int`: Iteration at which ELBO estimate was maximized
 - `num_tries::Int`: Number of tries until Pathfinder succeeded
-- `optim_solution::SciMLBase.OptimizationSolution`: Solution object of optimization.
+- `optim_solution::`[`SciMLBase.OptimizationSolution`](@extref): Solution object of
+    optimization.
 - `optim_trace::Pathfinder.OptimizationTrace`: container for optimization trace of points,
     log-density, and gradient. The first point is the initial point.
 - `fit_distributions::AbstractVector{Distributions.MvNormal}`: Multivariate normal
@@ -112,30 +113,32 @@ $(_ARGUMENT_DOCSTRING)
 - `ndraws_elbo::Int=$DEFAULT_NDRAWS_ELBO`: Number of draws used to estimate the ELBO
 - `ndraws::Int=ndraws_elbo`: number of approximate draws to return
 - `rng::Random.AbstractRNG`: The random number generator to be used for drawing samples
-- `executor::Transducers.Executor=Transducers.SequentialEx()`: Transducers.jl executor that
+- `executor::Transducers.Executor`: Transducers.jl executor that
     determines if and how to perform ELBO computation in parallel. The default
-    (`SequentialEx()`) performs no parallelization. If `rng` is known to be thread-safe, and
-    the log-density function is known to have no internal state, then
-    `Transducers.PreferParallel()` may be used to parallelize log-density evaluation.
-    This is generally only faster for expensive log density functions.
+    ([`Transducers.SequentialEx()`](@extref `Transducers.SequentialEx`)) performs no
+    parallelization. If `rng` is known to be thread-safe, and the log-density function is
+    known to have no internal state, then
+    [`Transducers.PreferParallel()`](@extref `Transducers.PreferParallel`) may be used to
+    parallelize log-density evaluation. This is generally only faster for expensive log
+    density functions.
 - `history_length::Int=$DEFAULT_HISTORY_LENGTH`: Size of the history used to approximate
     the inverse Hessian.
 - `optimizer`: Optimizer to be used for constructing trajectory. Can be any optimizer
-    compatible with Optimization.jl, so long as it supports callbacks. Defaults to
-    `Optim.LBFGS(; m=history_length, linesearch=LineSearches.HagerZhang(), alphaguess=LineSearches.InitialHagerZhang())`.
-    See the [Optimization.jl documentation](https://optimization.sciml.ai/stable) for
-    details.
-- `adtype::ADTypes.AbstractADType=AutoForwardDiff()`: Specifies which automatic
+    compatible with [Optimization.jl](https://docs.sciml.ai/Optimization/stable/), so long
+    as it supports callbacks. Defaults to
+    [`Optim.LBFGS`](@extref Optim `algo/lbfgs`)`(; m=history_length, linesearch=LineSearches.HagerZhang(), alphaguess=LineSearches.InitialHagerZhang())`.
+- `adtype::`[`ADTypes.AbstractADType`](@extref): Specifies which automatic
     differentiation backend should be used to compute the gradient, if `fun` does not
-    already specify the gradient. See
-    [SciML's Automatic Differentiation Recommendations](https://docs.sciml.ai/Optimization/stable/API/optimization_function/#Automatic-Differentiation-Construction-Choice-Recommendations).
-- `ntries::Int=1_000`: Number of times to try the optimization, restarting if it fails. Before
-    every restart, a new initial point is drawn using `init_sampler`.
+    already specify the gradient. Default is
+    [`ADTypes.AutoForwardDiff()`](@extref `ADTypes.AutoForwardDiff`). See
+    [Optimization.jl's Automatic Differentiation Recommendations](@extref Optimization ad).
+- `ntries::Int=1_000`: Number of times to try the optimization, restarting if it fails.
+    Before every restart, a new initial point is drawn using `init_sampler`.
 - `fail_on_nonfinite::Bool=true`: If `true`, optimization fails if the log-density is a
     `NaN` or `Inf` or if the gradient is ever non-finite. If `nretries > 0`, then
     optimization will be retried after reinitialization.
 - `kwargs...` : Remaining keywords are forwarded to
-    [`Optimization.solve`](https://optimization.sciml.ai/stable/API/solve).
+    [`Optimization.solve`](@extref Optimization `CommonSolve.solve`).

 # Returns
 - [`PathfinderResult`](@ref)
diff --git a/src/woodbury.jl b/src/woodbury.jl
index 8306f17e..3da91cde 100644
--- a/src/woodbury.jl
+++ b/src/woodbury.jl
@@ -190,13 +190,13 @@ The positive definite requirement is equivalent to the requirement that both ``A
 ``C`` are positive definite.

 For a derivation of this decomposition for the special case of diagonal ``A``, see
-appendix A of [^Zhang2021].
-
-[^Zhang2021]: Lu Zhang, Bob Carpenter, Andrew Gelman, Aki Vehtari (2021).
-    Pathfinder: Parallel quasi-Newton variational inference.
-    arXiv: [2108.03782](https://arxiv.org/abs/2108.03782) [stat.ML]
+[ZhangPathfinder2021; appendix A](@citet).

 See [`pdunfactorize`](@ref), [`WoodburyPDFactorization`](@ref), [`WoodburyPDMat`](@ref)
+
+# References
+
+- [ZhangPathfinder2021](@cite): Zhang et al. JMLR 23(306), 2022.
 """
 function pdfactorize(A::AbstractMatrix, B::AbstractMatrix, D::AbstractMatrix)
     cholA = cholesky(A isa Diagonal ? A : Symmetric(A))
@@ -240,7 +240,7 @@ thrown during construction.

 Upon construction, `WoodburyPDMat` calls [`pdfactorize`](@ref) to construct a
 [`WoodburyPDFactorization`](@ref), which is used in its overloads.
-z
+
 See [`pdfactorize`](@ref), [`WoodburyPDFactorization`](@ref)
 """
 struct WoodburyPDMat{
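For reviewers who want to sanity-check the new cross-referencing machinery locally, here is a minimal sketch of how the pieces added in `docs/make.jl` fit together. It reuses only calls that appear in the diff above; the pared-down page list and inventory selection are illustrative assumptions, not part of the patch itself.

```julia
# Minimal local docs build exercising DocumenterCitations + DocumenterInterLinks.
# Run from docs/ with its Project.toml instantiated.
using Documenter, DocumenterCitations, DocumenterInterLinks
using Pathfinder

# Bibliography backing [Byrd1994](@cite)-style links; :numeric matches this PR.
bib = CitationBibliography(joinpath(@__DIR__, "src", "references.bib"); style=:numeric)

# InterLinks resolves @extref links. A bare URL suffices when the project hosts
# objects.inv at the default location; a tuple adds fallbacks, e.g. the hand-written
# inventories in docs/inventories/ for projects whose stable docs lack one.
links = InterLinks(
    "LogDensityProblems" => "https://www.tamaspapp.eu/LogDensityProblems.jl/stable/",
    "Distributions" => (
        "https://juliastats.org/Distributions.jl/stable/",
        joinpath(@__DIR__, "inventories", "Distributions.toml"),
    ),
)

# Both plugins are handed to makedocs via `plugins`, as in the full docs/make.jl.
makedocs(;
    modules=[Pathfinder],
    sitename="Pathfinder.jl",
    pages=["Home" => "index.md", "References" => "references.md"],
    plugins=[bib, links],
)
```

With this wiring in place, markdown pages and docstrings can write `[ZhangPathfinder2021](@citep)` for a parenthetical citation and `[`Distributions.MixtureModel`](@extref)` for an external-docs link, which is exactly the pattern the hunks above apply throughout.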