Commit

Merge 765f12d into de2260e
aldma authored Jan 5, 2022
2 parents de2260e + 765f12d commit 19a0030
Showing 11 changed files with 401 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -31,6 +31,7 @@
Vũ-Condat primal-dual algorithm[^chambolle_2011][^vu_2013][^condat_2013] | [`VuCondat`](src/algorithms/primal_dual.jl)
Davis-Yin splitting[^davis_2017] | [`DavisYin`](src/algorithms/davis_yin.jl)
Asymmetric forward-backward-adjoint splitting[^latafat_2017] | [`AFBA`](src/algorithms/primal_dual.jl)
PANOC (L-BFGS)[^stella_2017] | [`PANOC`](src/algorithms/panoc.jl)
PANOC+ (L-BFGS)[^demarchi_2021] | [`PANOCplus`](src/algorithms/panocplus.jl)
ZeroFPR (L-BFGS)[^themelis_2018] | [`ZeroFPR`](src/algorithms/zerofpr.jl)
Douglas-Rachford line-search (L-BFGS)[^themelis_2020] | [`DRLS`](src/algorithms/drls.jl)

@@ -65,3 +66,5 @@
Contributions are welcome in the form of [issues notification](https://github.co
[^themelis_2018]: Themelis, Stella, Patrinos, *Forward-backward envelope for the sum of two nonconvex functions: Further properties and nonmonotone line-search algorithms*, SIAM Journal on Optimization, vol. 28, no. 3, pp. 2274–2303 (2018). [link](https://epubs.siam.org/doi/10.1137/16M1080240)

[^themelis_2020]: Themelis, Stella, Patrinos, *Douglas-Rachford splitting and ADMM for nonconvex optimization: Accelerated and Newton-type algorithms*, arXiv preprint (2020). [link](https://arxiv.org/abs/2005.10230)

[^demarchi_2021]: De Marchi, Themelis, *Proximal gradient algorithms under local Lipschitz gradient continuity: a convergence and robustness analysis of PANOC*, arXiv preprint (2021). [link](https://arxiv.org/abs/2112.13000)
1 change: 1 addition & 0 deletions src/ProximalAlgorithms.jl
@@ -33,5 +33,6 @@
include("algorithms/primal_dual.jl")
include("algorithms/davis_yin.jl")
include("algorithms/li_lin.jl")
include("algorithms/fista.jl")
include("algorithms/panocplus.jl")

end # module
223 changes: 223 additions & 0 deletions src/algorithms/panocplus.jl
@@ -0,0 +1,223 @@
# De Marchi, Themelis, "Proximal gradient algorithms under local Lipschitz
# gradient continuity: a convergence and robustness analysis of PANOC",
# arXiv:2112.13000 (2021).

using Base.Iterators
using ProximalAlgorithms.IterationTools
using ProximalOperators: Zero
using LinearAlgebra
using Printf

"""
    PANOCplusIteration(; <keyword-arguments>)

Instantiate the PANOCplus algorithm (see [1]) for solving optimization problems
of the form

    minimize f(Ax) + g(x),

where `f` is locally smooth and `A` is a linear mapping (for example, a matrix).

# Arguments
- `x0`: initial point.
- `f=Zero()`: smooth objective term.
- `A=I`: linear operator (e.g. a matrix).
- `g=Zero()`: proximable objective term.
- `Lf=nothing`: Lipschitz constant of the gradient of x ↦ f(Ax).
- `gamma=nothing`: stepsize to use; defaults to `alpha/Lf` when `Lf` is set.
- `adaptive=false`: whether the stepsize should be adaptively adjusted; defaults to `true` when `gamma` is not set.
- `minimum_gamma=1e-7`: lower bound to `gamma` in case `adaptive == true`.
- `max_backtracks=20`: maximum number of line-search backtracks.
- `directions=LBFGS(5)`: strategy to use to compute line-search directions.

# References
- [1] De Marchi, Themelis, "Proximal gradient algorithms under local Lipschitz
gradient continuity: a convergence and robustness analysis of PANOC",
arXiv:2112.13000 (2021).
"""

Base.@kwdef struct PANOCplusIteration{R,Tx,Tf,TA,Tg,TLf,Tgamma,D}
f::Tf = Zero()
A::TA = I
g::Tg = Zero()
x0::Tx
alpha::R = real(eltype(x0))(0.95)
beta::R = real(eltype(x0))(0.5)
Lf::TLf = nothing
gamma::Tgamma = Lf === nothing ? nothing : (alpha / Lf)
adaptive::Bool = gamma === nothing
minimum_gamma::R = real(eltype(x0))(1e-7)
max_backtracks::Int = 20
directions::D = LBFGS(5)
end
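
# A minimal usage sketch of the iterable interface (illustrative problem data;
# assumes ProximalOperators is loaded so that LeastSquares and NormL1 are
# available):
#
#     using ProximalOperators
#     f = LeastSquares(A, b)
#     g = NormL1(lam)
#     iter = PANOCplusIteration(f = f, g = g, x0 = zeros(size(A, 2)))
#     for state in Iterators.take(iter, 100)
#         # state.z is the current forward-backward point, state.res the residual
#     end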

Base.IteratorSize(::Type{<:PANOCplusIteration}) = Base.IsInfinite()

Base.@kwdef mutable struct PANOCplusState{R,Tx,TAx,TH}
x::Tx # iterate
Ax::TAx # A times x
f_Ax::R # value of smooth term
grad_f_Ax::TAx # gradient of f at Ax
At_grad_f_Ax::Tx # gradient of smooth term
gamma::R # stepsize parameter of forward and backward steps
y::Tx # forward point
z::Tx # forward-backward point
g_z::R # value of nonsmooth term (at z)
res::Tx # fixed-point residual at iterate (= x - z)
H::TH # variable metric
tau::R = zero(gamma)
x_prev::Tx = similar(x)
res_prev::Tx = similar(x)
d::Tx = similar(x)
Ad::TAx = similar(Ax)
x_d::Tx = similar(x)
Ax_d::TAx = similar(Ax)
f_Ax_d::R = zero(real(eltype(x)))
grad_f_Ax_d::TAx = similar(Ax)
At_grad_f_Ax_d::Tx = similar(x)
z_curr::Tx = similar(x)
Az::TAx = similar(Ax)
grad_f_Az::TAx = similar(Ax)
At_grad_f_Az::Tx = similar(x)
end

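# Quadratic upper model of x ↦ f(Ax) around the current iterate, evaluated at
# the forward-backward point z = x - res (delegates to the shared f_model helper).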
f_model(iter::PANOCplusIteration, state::PANOCplusState) = f_model(state.f_Ax, state.At_grad_f_Ax, state.res, iter.alpha / state.gamma)

function Base.iterate(iter::PANOCplusIteration{R}) where {R}
x = copy(iter.x0)
Ax = iter.A * x
grad_f_Ax, f_Ax = gradient(iter.f, Ax)
gamma = iter.gamma === nothing ? iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : iter.gamma
At_grad_f_Ax = iter.A' * grad_f_Ax
y = x - gamma .* At_grad_f_Ax
z, g_z = prox(iter.g, y, gamma)
state = PANOCplusState(
x=x, Ax=Ax, f_Ax=f_Ax, grad_f_Ax=grad_f_Ax, At_grad_f_Ax=At_grad_f_Ax,
gamma=gamma, y=y, z=z, g_z=g_z, res=x-z, H=initialize(iter.directions, x),
)
if iter.gamma === nothing || iter.adaptive
state.gamma, state.g_z, f_Az, f_Az_upp = backtrack_stepsize!(
state.gamma, iter.f, iter.A, iter.g,
state.x, state.f_Ax, state.At_grad_f_Ax, state.y, state.z, state.g_z, state.res,
state.Az, state.grad_f_Az,
alpha = iter.alpha, minimum_gamma = iter.minimum_gamma,
)
end
return state, state
end

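# Direction computation and metric bookkeeping dispatch on the acceleration
# style of the chosen `directions` strategy (quasi-Newton vs. no acceleration).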
set_next_direction!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = mul!(state.d, state.H, -state.res)
set_next_direction!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = state.d .= .-state.res
set_next_direction!(iter::PANOCplusIteration, state::PANOCplusState) = set_next_direction!(acceleration_style(typeof(iter.directions)), iter, state)

update_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = update!(state.H, state.x - state.x_prev, state.res - state.res_prev)
update_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return
update_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = update_direction_state!(acceleration_style(typeof(iter.directions)), iter, state)

reset_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = reset!(state.H)
reset_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return
reset_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = reset_direction_state!(acceleration_style(typeof(iter.directions)), iter, state)

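# Main iteration: save the current iterate and residual, evaluate the
# forward-backward envelope (FBE), then backtrack tau along the direction d
# (blending back towards the nominal forward-backward step), halving gamma
# whenever the quadratic model of f is violated at z.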
function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where R
f_Az = R(Inf)

# store iterate and residual for metric update later on
state.x_prev .= state.x
state.res_prev .= state.res

# compute FBE
FBE_x = f_model(iter, state) + state.g_z

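# sufficient-decrease threshold on the FBE, with a small tolerance to absorb
# rounding errors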
sigma = iter.beta * (0.5 / state.gamma) * (1 - iter.alpha)
tol = 10 * eps(R) * (1 + abs(FBE_x))
threshold = FBE_x - sigma * norm(state.res)^2 + tol

tau_backtracks = 0
can_update_direction = true

while true

if can_update_direction
# compute direction
set_next_direction!(iter, state)
# backtrack tau 1 → 0
state.tau = R(1)
state.x .= state.x_prev .+ state.d
else
state.x .= (1 - state.tau) .* (state.x_prev .- state.res_prev) .+ state.tau .* (state.x_prev .+ state.d)
end

mul!(state.Ax, iter.A, state.x)
state.f_Ax = gradient!(state.grad_f_Ax, iter.f, state.Ax)
mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax)

state.y .= state.x .- state.gamma .* state.At_grad_f_Ax
state.g_z = prox!(state.z, iter.g, state.y, state.gamma)
state.res .= state.x .- state.z

f_Az_upp = f_model(iter, state)

if iter.gamma === nothing || iter.adaptive
mul!(state.Az, iter.A, state.z)
f_Az = gradient!(state.grad_f_Az, iter.f, state.Az)
tol = 10 * eps(R) * (1 + abs(f_Az))
if f_Az > f_Az_upp + tol && state.gamma >= iter.minimum_gamma
state.gamma *= 0.5
if state.gamma < iter.minimum_gamma
@warn "stepsize `gamma` became too small ($(state.gamma))"
end
can_update_direction = true
reset_direction_state!(iter, state)
continue
end
end

FBE_x_new = f_Az_upp + state.g_z
if FBE_x_new <= threshold
# update metric
update_direction_state!(iter, state)
return state, state
end
state.tau *= 0.5
if tau_backtracks > iter.max_backtracks
@warn "stepsize `tau` became too small ($(state.tau))"
return nothing
end
tau_backtracks += 1
can_update_direction = false

end

end

# Solver

struct PANOCplus{R, K}
maxit::Int
tol::R
verbose::Bool
freq::Int
kwargs::K
end

function (solver::PANOCplus)(x0; kwargs...)
stop(state::PANOCplusState) = norm(state.res, Inf) / state.gamma <= solver.tol
disp((it, state)) = @printf(
"%5d | %.3e | %.3e | %.3e\n",
it,
state.gamma,
norm(state.res, Inf) / state.gamma,
state.tau,
)
iter = PANOCplusIteration(; x0=x0, solver.kwargs..., kwargs...)
iter = take(halt(iter, stop), solver.maxit)
iter = enumerate(iter)
if solver.verbose
iter = tee(sample(iter, solver.freq), disp)
end
num_iters, state_final = loop(iter)
return state_final.z, num_iters
end

PANOCplus(; maxit=1_000, tol=1e-8, verbose=false, freq=10, kwargs...) =
PANOCplus(maxit, tol, verbose, freq, kwargs)
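
# Solver-level usage (illustrative; mirrors the tests added below):
#
#     solver = PANOCplus(tol = 1e-6)
#     x, it = solver(x0, f = f, A = A, g = g, Lf = Lf)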
29 changes: 29 additions & 0 deletions test/problems/test_equivalence.jl
@@ -5,6 +5,7 @@
using ProximalOperators
using ProximalAlgorithms:
DouglasRachfordIteration, DRLSIteration,
ForwardBackwardIteration, PANOCIteration,
PANOCplusIteration,
NoAcceleration

@testset "DR/DRLS equivalence ($T)" for T in [Float32, Float64]
@@ -62,3 +63,31 @@
end
@test isapprox(fb_state.z, panoc_state.z)
end
end

@testset "PANOC/PANOCplus equivalence ($T)" for T in [Float32, Float64]
A = T[
1.0 -2.0 3.0 -4.0 5.0
2.0 -1.0 0.0 -1.0 3.0
-1.0 0.0 4.0 -3.0 2.0
-1.0 -1.0 -1.0 1.0 3.0
]
b = T[1.0, 2.0, 3.0, 4.0]

m, n = size(A)

R = real(T)

lam = R(0.1) * norm(A' * b, Inf)

f = LeastSquares(A, b)
g = NormL1(lam)

x0 = zeros(R, n)

panoc_iter = PANOCIteration(f=f, g=g, x0=x0, gamma=R(0.95) / opnorm(A)^2)
panocplus_iter = PANOCplusIteration(f=f, g=g, x0=x0, gamma=R(0.95) / opnorm(A)^2)

for (panoc_state, panocplus_state) in Iterators.take(zip(panoc_iter, panocplus_iter), 10)
@test isapprox(panoc_state.z, panocplus_state.z)
end
end
22 changes: 22 additions & 0 deletions test/problems/test_lasso_small.jl
@@ -124,6 +124,28 @@
using ProximalAlgorithms:
@test x0 == x0_backup
end

@testset "PANOCplus (fixed step)" begin
x0 = zeros(T, n)
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(tol = TOL)
x, it = @inferred solver(x0, f = f, A = A, g = g, Lf = Lf)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup
end

@testset "PANOCplus (adaptive step)" begin
x0 = zeros(T, n)
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
x, it = @inferred solver(x0, f = f, A = A, g = g)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup
end

@testset "DouglasRachford" begin
x0 = zeros(T, n)
x0_backup = copy(x0)
26 changes: 26 additions & 0 deletions test/problems/test_lasso_small_h_split.jl
@@ -142,4 +142,30 @@

end

@testset "PANOCplus" begin

# PANOCplus/Nonadaptive

x0 = ArrayPartition(zeros(T, n1), zeros(T, n2))
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(tol = TOL)
x, it = solver(x0, f = f, A = A, g = g, Lf = Lf)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup

# PANOCplus/Adaptive

x0 = ArrayPartition(zeros(T, n1), zeros(T, n2))
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
x, it = solver(x0, f = f, A = A, g = g)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup

end

end
18 changes: 18 additions & 0 deletions test/problems/test_lasso_small_strongly_convex.jl
@@ -69,4 +69,22 @@
using ProximalAlgorithms
@test x0 == x0_backup
end

@testset "PANOC" begin
solver = ProximalAlgorithms.PANOC(tol = TOL)
y, it = solver(x0, f = f, g = h, Lf = Lf)
@test eltype(y) == T
@test norm(y - x_star, Inf) <= TOL
@test it < 45
@test x0 == x0_backup
end

@testset "PANOCplus" begin
solver = ProximalAlgorithms.PANOCplus(tol = TOL)
y, it = solver(x0, f = f, g = h, Lf = Lf)
@test eltype(y) == T
@test norm(y - x_star, Inf) <= TOL
@test it < 45
@test x0 == x0_backup
end

end
26 changes: 26 additions & 0 deletions test/problems/test_lasso_small_v_split.jl
@@ -135,4 +135,30 @@

end

@testset "PANOCplus" begin

# PANOCplus/Nonadaptive

x0 = zeros(T, n)
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(tol = TOL)
x, it = solver(x0, f = f, A = A, g = g, Lf = opnorm([A1; A2])^2)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup

# PANOCplus/Adaptive

x0 = zeros(T, n)
x0_backup = copy(x0)
solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
x, it = solver(x0, f = f, A = A, g = g)
@test eltype(x) == T
@test norm(x - x_star, Inf) <= TOL
@test it < 20
@test x0 == x0_backup

end

end