diff --git a/README.md b/README.md
index 265c918..7b40464 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ Vũ-Condat primal-dual algorithm[^chambolle_2011][^vu_2013][^condat_2013] | [`Vu
 Davis-Yin splitting[^davis_2017] | [`DavisYin`](src/algorithms/davis_yin.jl)
 Asymmetric forward-backward-adjoint splitting[^latafat_2017] | [`AFBA`](src/algorithms/primal_dual.jl)
 PANOC (L-BFGS)[^stella_2017] | [`PANOC`](src/algorithms/panoc.jl)
+PANOC+ (L-BFGS)[^demarchi_2021] | [`PANOCplus`](src/algorithms/panocplus.jl)
 ZeroFPR (L-BFGS)[^themelis_2018] | [`ZeroFPR`](src/algorithms/zerofpr.jl)
 Douglas-Rachford line-search (L-BFGS)[^themelis_2020] | [`DRLS`](src/algorithms/drls.jl)
@@ -65,3 +66,5 @@ Contributions are welcome in the form of [issues notification](https://github.co
 [^themelis_2018]: Themelis, Stella, Patrinos, *Forward-backward envelope for the sum of two nonconvex functions: Further properties and nonmonotone line-search algorithms*, SIAM Journal on Optimization, vol. 28, no. 3, pp. 2274–2303 (2018). [link](https://epubs.siam.org/doi/10.1137/16M1080240)
 [^themelis_2020]: Themelis, Stella, Patrinos, *Douglas-Rachford splitting and ADMM for nonconvex optimization: Accelerated and Newton-type algorithms*, arXiv preprint (2020). [link](https://arxiv.org/abs/2005.10230)
+
+[^demarchi_2021]: De Marchi, Themelis, *Proximal gradient algorithms under local Lipschitz gradient continuity: a convergence and robustness analysis of PANOC*, arXiv preprint (2021). [link](https://arxiv.org/abs/2112.13000)
diff --git a/src/ProximalAlgorithms.jl b/src/ProximalAlgorithms.jl
index 32b5b11..03647ec 100644
--- a/src/ProximalAlgorithms.jl
+++ b/src/ProximalAlgorithms.jl
@@ -33,5 +33,6 @@ include("algorithms/primal_dual.jl")
 include("algorithms/davis_yin.jl")
 include("algorithms/li_lin.jl")
 include("algorithms/fista.jl")
+include("algorithms/panocplus.jl")
 
 end # module
diff --git a/src/algorithms/panocplus.jl b/src/algorithms/panocplus.jl
new file mode 100644
index 0000000..b1625f5
--- /dev/null
+++ b/src/algorithms/panocplus.jl
@@ -0,0 +1,223 @@
+# De Marchi, Themelis, "Proximal gradient algorithms under local Lipschitz
+# gradient continuity: a convergence and robustness analysis of PANOC",
+# arXiv:2112.13000 (2021).
+
+using Base.Iterators
+using ProximalAlgorithms.IterationTools
+using ProximalOperators: Zero
+using LinearAlgebra
+using Printf
+
+"""
+    PANOCplusIteration(; <keyword-arguments>)
+
+Instantiate the PANOCplus algorithm (see [1]) for solving optimization problems
+of the form
+
+    minimize f(Ax) + g(x),
+
+where `f` is locally smooth and `A` is a linear mapping (for example, a matrix).
+
+# Arguments
+- `x0`: initial point.
+- `f=Zero()`: smooth objective term.
+- `A=I`: linear operator (e.g. a matrix).
+- `g=Zero()`: proximable objective term.
+- `Lf=nothing`: Lipschitz constant of the gradient of x ↦ f(Ax).
+- `gamma=nothing`: stepsize to use; defaults to `alpha/Lf` if `Lf` is set.
+- `adaptive=false`: if `true`, forces the stepsize to be adaptively adjusted even when `Lf` is set.
+- `minimum_gamma=1e-7`: lower bound to `gamma` in case `adaptive == true`.
+- `max_backtracks=20`: maximum number of line-search backtracks.
+- `directions=LBFGS(5)`: strategy to use to compute line-search directions.
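+
+# Example
+
+A minimal sketch of driving the iteration directly (the least-squares term,
+regularizer, and starting point below are illustrative placeholders):
+
+    using ProximalOperators
+    f = LeastSquares(randn(10, 5), randn(10))
+    g = NormL1(1.0)
+    iter = PANOCplusIteration(f = f, g = g, x0 = zeros(5))
+    for state in Iterators.take(iter, 100)
+        # state.z holds the current forward-backward iterate
+    end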
+""" + +Base.@kwdef struct PANOCplusIteration{R,Tx,Tf,TA,Tg,TLf,Tgamma,D} + f::Tf = Zero() + A::TA = I + g::Tg = Zero() + x0::Tx + alpha::R = real(eltype(x0))(0.95) + beta::R = real(eltype(x0))(0.5) + Lf::TLf = nothing + gamma::Tgamma = Lf === nothing ? nothing : (alpha / Lf) + adaptive::Bool = gamma === nothing + minimum_gamma::R = real(eltype(x0))(1e-7) + max_backtracks::Int = 20 + directions::D = LBFGS(5) +end + +Base.IteratorSize(::Type{<:PANOCplusIteration}) = Base.IsInfinite() + +Base.@kwdef mutable struct PANOCplusState{R,Tx,TAx,TH} + x::Tx # iterate + Ax::TAx # A times x + f_Ax::R # value of smooth term + grad_f_Ax::TAx # gradient of f at Ax + At_grad_f_Ax::Tx # gradient of smooth term + gamma::R # stepsize parameter of forward and backward steps + y::Tx # forward point + z::Tx # forward-backward point + g_z::R # value of nonsmooth term (at z) + res::Tx # fixed-point residual at iterate (= x - z) + H::TH # variable metric + tau::R = zero(gamma) + x_prev::Tx = similar(x) + res_prev::Tx = similar(x) + d::Tx = similar(x) + Ad::TAx = similar(Ax) + x_d::Tx = similar(x) + Ax_d::TAx = similar(Ax) + f_Ax_d::R = zero(real(eltype(x))) + grad_f_Ax_d::TAx = similar(Ax) + At_grad_f_Ax_d::Tx = similar(x) + z_curr::Tx = similar(x) + Az::TAx = similar(Ax) + grad_f_Az::TAx = similar(Ax) + At_grad_f_Az::Tx = similar(x) +end + +f_model(iter::PANOCplusIteration, state::PANOCplusState) = f_model(state.f_Ax, state.At_grad_f_Ax, state.res, iter.alpha / state.gamma) + +function Base.iterate(iter::PANOCplusIteration{R}) where {R} + x = copy(iter.x0) + Ax = iter.A * x + grad_f_Ax, f_Ax = gradient(iter.f, Ax) + gamma = iter.gamma === nothing ? iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : iter.gamma + At_grad_f_Ax = iter.A' * grad_f_Ax + y = x - gamma .* At_grad_f_Ax + z, g_z = prox(iter.g, y, gamma) + state = PANOCplusState( + x=x, Ax=Ax, f_Ax=f_Ax, grad_f_Ax=grad_f_Ax, At_grad_f_Ax=At_grad_f_Ax, + gamma=gamma, y=y, z=z, g_z=g_z, res=x-z, H=initialize(iter.directions, x), + ) + if (iter.gamma === nothing || iter.adaptive == true) + state.gamma, state.g_z, f_Az, f_Az_upp = backtrack_stepsize!( + state.gamma, iter.f, iter.A, iter.g, + state.x, state.f_Ax, state.At_grad_f_Ax, state.y, state.z, state.g_z, state.res, + state.Az, state.grad_f_Az, + alpha = iter.alpha, minimum_gamma = iter.minimum_gamma, + ) + end + return state, state +end + +set_next_direction!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = mul!(state.d, state.H, -state.res) +set_next_direction!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = state.d .= .-state.res +set_next_direction!(iter::PANOCplusIteration, state::PANOCplusState) = set_next_direction!(acceleration_style(typeof(iter.directions)), iter, state) + +update_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = update!(state.H, state.x - state.x_prev, state.res - state.res_prev) +update_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return +update_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = update_direction_state!(acceleration_style(typeof(iter.directions)), iter, state) + +reset_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = reset!(state.H) +reset_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return +reset_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = reset_direction_state!(acceleration_style(typeof(iter.directions)), 
+set_next_direction!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = mul!(state.d, state.H, -state.res)
+set_next_direction!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = state.d .= .-state.res
+set_next_direction!(iter::PANOCplusIteration, state::PANOCplusState) = set_next_direction!(acceleration_style(typeof(iter.directions)), iter, state)
+
+update_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = update!(state.H, state.x - state.x_prev, state.res - state.res_prev)
+update_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return
+update_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = update_direction_state!(acceleration_style(typeof(iter.directions)), iter, state)
+
+reset_direction_state!(::QuasiNewtonStyle, ::PANOCplusIteration, state::PANOCplusState) = reset!(state.H)
+reset_direction_state!(::NoAccelerationStyle, ::PANOCplusIteration, state::PANOCplusState) = return
+reset_direction_state!(iter::PANOCplusIteration, state::PANOCplusState) = reset_direction_state!(acceleration_style(typeof(iter.directions)), iter, state)
+
+function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where {R}
+    f_Az = R(Inf)
+
+    # store iterate and residual for metric update later on
+    state.x_prev .= state.x
+    state.res_prev .= state.res
+
+    # compute FBE
+    FBE_x = f_model(iter, state) + state.g_z
+
+    sigma = iter.beta * (0.5 / state.gamma) * (1 - iter.alpha)
+    tol = 10 * eps(R) * (1 + abs(FBE_x))
+    threshold = FBE_x - sigma * norm(state.res)^2 + tol
+
+    tau_backtracks = 0
+    can_update_direction = true
+
+    while true
+
+        if can_update_direction
+            # compute direction
+            set_next_direction!(iter, state)
+            # backtrack tau 1 → 0
+            state.tau = R(1)
+            state.x .= state.x_prev .+ state.d
+        else
+            state.x .= (1 - state.tau) .* (state.x_prev .- state.res_prev) .+ state.tau .* (state.x_prev .+ state.d)
+        end
+
+        mul!(state.Ax, iter.A, state.x)
+        state.f_Ax = gradient!(state.grad_f_Ax, iter.f, state.Ax)
+        mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax)
+
+        state.y .= state.x .- state.gamma .* state.At_grad_f_Ax
+        state.g_z = prox!(state.z, iter.g, state.y, state.gamma)
+        state.res .= state.x .- state.z
+
+        f_Az_upp = f_model(iter, state)
+
+        if iter.gamma === nothing || iter.adaptive
+            mul!(state.Az, iter.A, state.z)
+            f_Az = gradient!(state.grad_f_Az, iter.f, state.Az)
+            tol = 10 * eps(R) * (1 + abs(f_Az))
+            if f_Az > f_Az_upp + tol && state.gamma >= iter.minimum_gamma
+                state.gamma *= 0.5
+                if state.gamma < iter.minimum_gamma
+                    @warn "stepsize `gamma` became too small ($(state.gamma))"
+                end
+                can_update_direction = true
+                reset_direction_state!(iter, state)
+                continue
+            end
+        end
+
+        FBE_x_new = f_Az_upp + state.g_z
+        if FBE_x_new <= threshold
+            # update metric
+            update_direction_state!(iter, state)
+            return state, state
+        end
+        state.tau *= 0.5
+        if tau_backtracks > iter.max_backtracks
+            @warn "line-search parameter `tau` became too small ($(state.tau))"
+            return nothing
+        end
+        tau_backtracks += 1
+        can_update_direction = false
+
+    end
+
+end
+
+# Solver
+
+struct PANOCplus{R, K}
+    maxit::Int
+    tol::R
+    verbose::Bool
+    freq::Int
+    kwargs::K
+end
+
+function (solver::PANOCplus)(x0; kwargs...)
+    stop(state::PANOCplusState) = norm(state.res, Inf) / state.gamma <= solver.tol
+    disp((it, state)) = @printf(
+        "%5d | %.3e | %.3e | %.3e\n",
+        it,
+        state.gamma,
+        norm(state.res, Inf) / state.gamma,
+        state.tau,
+    )
+    iter = PANOCplusIteration(; x0=x0, solver.kwargs..., kwargs...)
+    iter = take(halt(iter, stop), solver.maxit)
+    iter = enumerate(iter)
+    if solver.verbose
+        iter = tee(sample(iter, solver.freq), disp)
+    end
+    num_iters, state_final = loop(iter)
+    return state_final.z, num_iters
+end
+
+PANOCplus(; maxit=1_000, tol=1e-8, verbose=false, freq=10, kwargs...) =
+    PANOCplus(maxit, tol, verbose, freq, kwargs)
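+
+# Usage sketch (commented out; mirrors the calling pattern exercised in the
+# tests, with `x0`, `f`, `A`, `g`, `Lf` standing in for problem data):
+#
+#     solver = PANOCplus(tol = 1e-6)
+#     x, it = solver(x0, f = f, A = A, g = g, Lf = Lf)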
diff --git a/test/problems/test_equivalence.jl b/test/problems/test_equivalence.jl
index 318095e..004d618 100644
--- a/test/problems/test_equivalence.jl
+++ b/test/problems/test_equivalence.jl
@@ -5,6 +5,7 @@ using ProximalOperators
 using ProximalAlgorithms:
     DouglasRachfordIteration, DRLSIteration,
     ForwardBackwardIteration, PANOCIteration,
+    PANOCplusIteration,
     NoAcceleration
 
 @testset "DR/DRLS equivalence ($T)" for T in [Float32, Float64]
@@ -62,3 +63,31 @@ end
         @test isapprox(fb_state.z, panoc_state.z)
     end
 end
+
+@testset "PANOC/PANOCplus equivalence ($T)" for T in [Float32, Float64]
+    A = T[
+        1.0 -2.0 3.0 -4.0 5.0
+        2.0 -1.0 0.0 -1.0 3.0
+        -1.0 0.0 4.0 -3.0 2.0
+        -1.0 -1.0 -1.0 1.0 3.0
+    ]
+    b = T[1.0, 2.0, 3.0, 4.0]
+
+    m, n = size(A)
+
+    R = real(T)
+
+    lam = R(0.1) * norm(A' * b, Inf)
+
+    f = LeastSquares(A, b)
+    g = NormL1(lam)
+
+    x0 = zeros(R, n)
+
+    panoc_iter = PANOCIteration(f=f, g=g, x0=x0, gamma=R(0.95) / opnorm(A)^2)
+    panocplus_iter = PANOCplusIteration(f=f, g=g, x0=x0, gamma=R(0.95) / opnorm(A)^2)
+
+    # with the same fixed stepsize, the two methods should produce identical iterates
+    for (panoc_state, panocplus_state) in Iterators.take(zip(panoc_iter, panocplus_iter), 10)
+        @test isapprox(panoc_state.z, panocplus_state.z)
+    end
+end
diff --git a/test/problems/test_lasso_small.jl b/test/problems/test_lasso_small.jl
index b6735d2..311094a 100644
--- a/test/problems/test_lasso_small.jl
+++ b/test/problems/test_lasso_small.jl
@@ -124,6 +124,28 @@ using ProximalAlgorithms:
         @test x0 == x0_backup
     end
 
+    @testset "PANOCplus (fixed step)" begin
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL)
+        x, it = @inferred solver(x0, f = f, A = A, g = g, Lf = Lf)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+    end
+
+    @testset "PANOCplus (adaptive step)" begin
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
+        x, it = @inferred solver(x0, f = f, A = A, g = g)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+    end
+
     @testset "DouglasRachford" begin
         x0 = zeros(T, n)
         x0_backup = copy(x0)
diff --git a/test/problems/test_lasso_small_h_split.jl b/test/problems/test_lasso_small_h_split.jl
index 41a4c8e..8fc179a 100644
--- a/test/problems/test_lasso_small_h_split.jl
+++ b/test/problems/test_lasso_small_h_split.jl
@@ -142,4 +142,30 @@
     end
 
+    @testset "PANOCplus" begin
+
+        # PANOCplus/Nonadaptive
+
+        x0 = ArrayPartition(zeros(T, n1), zeros(T, n2))
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL)
+        x, it = solver(x0, f = f, A = A, g = g, Lf = Lf)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+
+        ## PANOCplus/Adaptive
+
+        x0 = ArrayPartition(zeros(T, n1), zeros(T, n2))
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
+        x, it = solver(x0, f = f, A = A, g = g)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+
+    end
+
 end
diff --git a/test/problems/test_lasso_small_strongly_convex.jl b/test/problems/test_lasso_small_strongly_convex.jl
index 1962fa3..6de93a2 100644
--- a/test/problems/test_lasso_small_strongly_convex.jl
+++ b/test/problems/test_lasso_small_strongly_convex.jl
@@ -69,4 +69,22 @@ using ProximalAlgorithms
         @test x0 == x0_backup
     end
 
+    @testset "PANOC" begin
+        solver = ProximalAlgorithms.PANOC(tol = TOL)
+        y, it = solver(x0, f = f, g = h, Lf = Lf)
+        @test eltype(y) == T
+        @test norm(y - x_star, Inf) <= TOL
+        @test it < 45
+        @test x0 == x0_backup
+    end
+
+    @testset "PANOCplus" begin
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL)
+        y, it = solver(x0, f = f, g = h, Lf = Lf)
+        @test eltype(y) == T
+        @test norm(y - x_star, Inf) <= TOL
+        @test it < 45
+        @test x0 == x0_backup
+    end
+
 end
diff --git a/test/problems/test_lasso_small_v_split.jl b/test/problems/test_lasso_small_v_split.jl
index fdf28c9..5569adc 100644
--- a/test/problems/test_lasso_small_v_split.jl
+++ b/test/problems/test_lasso_small_v_split.jl
@@ -135,4 +135,30 @@
     end
 
+    @testset "PANOCplus" begin
+
+        # PANOCplus/Nonadaptive
+
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL)
+        x, it = solver(x0, f = f, A = A, g = g, Lf = opnorm([A1; A2])^2)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+
+        ## PANOCplus/Adaptive
+
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
+        x, it = solver(x0, f = f, A = A, g = g)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+        @test x0 == x0_backup
+
+    end
+
 end
diff --git a/test/problems/test_nonconvex_qp.jl b/test/problems/test_nonconvex_qp.jl
index fdc6aed..13ac2b5 100644
--- a/test/problems/test_nonconvex_qp.jl
+++ b/test/problems/test_nonconvex_qp.jl
@@ -28,6 +28,16 @@ using Test
         @test x0 == x0_backup
     end
 
+    @testset "PANOCplus" begin
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus()
+        x, it = solver(x0, f = f, g = g)
+        z = min.(upp, max.(low, x .- gamma .* (Q * x + q)))
+        @test norm(x - z, Inf) / gamma <= solver.tol
+        @test x0 == x0_backup
+    end
+
     @testset "ZeroFPR" begin
         x0 = zeros(T, n)
         x0_backup = copy(x0)
@@ -82,6 +92,16 @@ end
         @test x0 == x0_backup
     end
 
+    @testset "PANOCplus" begin
+        x0 = zeros(T, n)
+        x0_backup = copy(x0)
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL)
+        x, it = solver(x0, f = f, g = g)
+        z = min.(upp, max.(low, x .- gamma .* (Q * x + q)))
+        @test norm(x - z, Inf) / gamma <= solver.tol
+        @test x0 == x0_backup
+    end
+
     @testset "ZeroFPR" begin
         x0 = zeros(T, n)
         x0_backup = copy(x0)
diff --git a/test/problems/test_sparse_logistic_small.jl b/test/problems/test_sparse_logistic_small.jl
index 12531cd..976f6d3 100644
--- a/test/problems/test_sparse_logistic_small.jl
+++ b/test/problems/test_sparse_logistic_small.jl
@@ -68,4 +68,15 @@
     @test it < 50
     @test x0 == x0_backup
 
+    # PANOCplus/Adaptive
+
+    x0 = zeros(T, n)
+    x0_backup = copy(x0)
+    solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL)
+    x, it = solver(x0, f = f, A = A, g = g)
+    @test eltype(x) == T
+    @test norm(x - x_star, Inf) <= 1e-4
+    @test it < 50
+    @test x0 == x0_backup
+
 end
diff --git a/test/problems/test_verbose.jl b/test/problems/test_verbose.jl
index abab201..46f2b9f 100644
--- a/test/problems/test_verbose.jl
+++ b/test/problems/test_verbose.jl
@@ -120,6 +120,28 @@
     end
 
+    @testset "PANOCplus" begin
+
+        # PANOCplus/Nonadaptive
+
+        x0 = zeros(T, n)
+        solver = ProximalAlgorithms.PANOCplus(tol = TOL, verbose = true)
+        x, it = solver(x0, f = f, A = A, g = g, Lf = Lf)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+
+        ## PANOCplus/Adaptive
+
+        x0 = zeros(T, n)
+        solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL, verbose = true)
+        x, it = solver(x0, f = f, A = A, g = g)
+        @test eltype(x) == T
+        @test norm(x - x_star, Inf) <= TOL
+        @test it < 20
+
+    end
+
     @testset "DouglasRachford" begin
 
         # Douglas-Rachford