From f58002a1a9c2ad2bd4afbb3d88c8d196fdadd996 Mon Sep 17 00:00:00 2001
From: Alberto De Marchi
Date: Fri, 8 Mar 2024 16:43:37 +0100
Subject: [PATCH] Regret factor in ForwardBackward (#91)

This PR adds the possibility to specify a regret factor that increases the stepsize gamma at every iteration, if adaptive, before backtracking. Recent results provide convergence guarantees even without a maximum value on gamma [Theorem 14, [arXiv:2208.00799v2](https://arxiv.org/abs/2208.00799)]. The feature is implemented for ForwardBackward and FastForwardBackward. It does not seem a good fit for accelerated methods like ZeroFPR, PANOC, and PANOCplus, at least when based on quasi-Newton directions, and it is unclear whether the FastForwardBackward solver actually benefits from it; see discussion in #86. Practical performance seems to improve with values of `regret_gamma` close to 1. Tests and references have been updated accordingly.
---
 docs/references.bib                         |   8 ++
 docs/src/guide/implemented_algorithms.md    |   2 +-
 src/algorithms/fast_forward_backward.jl     |   3 +
 src/algorithms/forward_backward.jl          |   5 +
 test/problems/test_lasso_small.jl           |  22 ++++
 .../test_lasso_small_strongly_convex.jl     |  36 ++++++
 test/problems/test_sparse_logistic_small.jl | 118 +++++++++++-------
 7 files changed, 145 insertions(+), 49 deletions(-)

diff --git a/docs/references.bib b/docs/references.bib
index 2ab7e30..d8c3987 100644
--- a/docs/references.bib
+++ b/docs/references.bib
@@ -185,3 +185,11 @@ @article{DeMarchi2022
   year={2022},
   url={https://doi.org/10.48550/arXiv.2112.13000}
 }
+
+@article{DeMarchi2024,
+  title={An interior proximal gradient method for nonconvex optimization},
+  author={De Marchi, Alberto and Themelis, Andreas},
+  journal={arXiv:2208.00799v2},
+  year={2024},
+  url={https://doi.org/10.48550/arXiv.2208.00799}
+}
\ No newline at end of file
diff --git a/docs/src/guide/implemented_algorithms.md b/docs/src/guide/implemented_algorithms.md
index a297a22..a14fe6b 100644
--- a/docs/src/guide/implemented_algorithms.md
+++ b/docs/src/guide/implemented_algorithms.md
@@ -24,7 +24,7 @@ This is the most popular model, by far the most thoroughly studied, and an abund
 Algorithm | Assumptions | Oracle | Implementation | References
 ----------|-------------|--------|----------------|-----------
-Proximal gradient | ``f`` smooth | ``\nabla f``, ``\operatorname{prox}_{\gamma g}`` | [`ForwardBackward`](@ref) | [Lions1979](@cite)
+Proximal gradient | ``f`` locally smooth | ``\nabla f``, ``\operatorname{prox}_{\gamma g}`` | [`ForwardBackward`](@ref) | [Lions1979](@cite), [DeMarchi2024](@cite)
 Douglas-Rachford | | ``\operatorname{prox}_{\gamma f}``, ``\operatorname{prox}_{\gamma g}`` | [`DouglasRachford`](@ref) | [Eckstein1992](@cite)
 Fast proximal gradient | ``f`` convex, smooth, ``g`` convex | ``\nabla f``, ``\operatorname{prox}_{\gamma g}`` | [`FastForwardBackward`](@ref) | [Tseng2008](@cite), [Beck2009](@cite)
 PANOC | ``f`` smooth | ``\nabla f``, ``\operatorname{prox}_{\gamma g}`` | [`PANOC`](@ref) | [Stella2017](@cite)
diff --git a/src/algorithms/fast_forward_backward.jl b/src/algorithms/fast_forward_backward.jl
index 006ebad..244148b 100644
--- a/src/algorithms/fast_forward_backward.jl
+++ b/src/algorithms/fast_forward_backward.jl
@@ -33,6 +33,7 @@ See also: [`FastForwardBackward`](@ref).
 - `gamma=nothing`: stepsize, defaults to `1/Lf` if `Lf` is set, and `nothing` otherwise.
 - `adaptive=true`: makes `gamma` adaptively adjust during the iterations; this is by default `gamma === nothing`.
- `minimum_gamma=1e-7`: lower bound to `gamma` in case `adaptive == true`. +- `regret_gamma=1.0`: factor to enlarge `gamma` in case `adaptive == true`, before backtracking. - `extrapolation_sequence=nothing`: sequence (iterator) of extrapolation coefficients to use for acceleration. # References @@ -48,6 +49,7 @@ Base.@kwdef struct FastForwardBackwardIteration{R,Tx,Tf,Tg,TLf,Tgamma,Textr} gamma::Tgamma = Lf === nothing ? nothing : (1 / Lf) adaptive::Bool = gamma === nothing minimum_gamma::R = real(eltype(x0))(1e-7) + regret_gamma::R = real(eltype(x0))(1.0) extrapolation_sequence::Textr = nothing end @@ -105,6 +107,7 @@ function Base.iterate( state::FastForwardBackwardState{R,Tx}, ) where {R,Tx} state.gamma = if iter.adaptive == true + state.gamma *= iter.regret_gamma gamma, state.g_z = backtrack_stepsize!( state.gamma, iter.f, diff --git a/src/algorithms/forward_backward.jl b/src/algorithms/forward_backward.jl index d21ee7a..3e52241 100644 --- a/src/algorithms/forward_backward.jl +++ b/src/algorithms/forward_backward.jl @@ -28,9 +28,11 @@ See also: [`ForwardBackward`](@ref). - `gamma=nothing`: stepsize to use, defaults to `1/Lf` if not set (but `Lf` is). - `adaptive=false`: forces the method stepsize to be adaptively adjusted. - `minimum_gamma=1e-7`: lower bound to `gamma` in case `adaptive == true`. +- `regret_gamma=1.0`: factor to enlarge `gamma` in case `adaptive == true`, before backtracking. # References 1. Lions, Mercier, “Splitting algorithms for the sum of two nonlinear operators,” SIAM Journal on Numerical Analysis, vol. 16, pp. 964–979 (1979). +2. De Marchi, Themelis, "An interior proximal gradient method for nonconvex optimization," arXiv:2208.00799v2 (2024). """ Base.@kwdef struct ForwardBackwardIteration{R,Tx,Tf,Tg,TLf,Tgamma} f::Tf = Zero() @@ -40,6 +42,7 @@ Base.@kwdef struct ForwardBackwardIteration{R,Tx,Tf,Tg,TLf,Tgamma} gamma::Tgamma = Lf === nothing ? nothing : (1 / Lf) adaptive::Bool = gamma === nothing minimum_gamma::R = real(eltype(x0))(1e-7) + regret_gamma::R = real(eltype(x0))(1.0) end Base.IteratorSize(::Type{<:ForwardBackwardIteration}) = Base.IsInfinite() @@ -84,6 +87,7 @@ function Base.iterate( state::ForwardBackwardState{R,Tx}, ) where {R,Tx} if iter.adaptive == true + state.gamma *= iter.regret_gamma state.gamma, state.g_z, state.f_x = backtrack_stepsize!( state.gamma, iter.f, @@ -150,6 +154,7 @@ See also: [`ForwardBackwardIteration`](@ref), [`IterativeAlgorithm`](@ref). # References 1. Lions, Mercier, “Splitting algorithms for the sum of two nonlinear operators,” SIAM Journal on Numerical Analysis, vol. 16, pp. 964–979 (1979). +2. De Marchi, Themelis, "An interior proximal gradient method for nonconvex optimization," arXiv:2208.00799v2 (2024). 
""" ForwardBackward(; maxit = 10_000, diff --git a/test/problems/test_lasso_small.jl b/test/problems/test_lasso_small.jl index 19a3c97..a40324a 100644 --- a/test/problems/test_lasso_small.jl +++ b/test/problems/test_lasso_small.jl @@ -65,6 +65,17 @@ using ProximalAlgorithms: @test x0 == x0_backup end + @testset "ForwardBackward (adaptive step, regret)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true, regret_gamma=R(1.01)) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= TOL + @test it < 150 + @test x0 == x0_backup + end + @testset "FastForwardBackward (fixed step)" begin x0 = zeros(T, n) x0_backup = copy(x0) @@ -87,6 +98,17 @@ using ProximalAlgorithms: @test x0 == x0_backup end + @testset "FastForwardBackward (adaptive step, regret)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true, regret_gamma=R(1.01)) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= TOL + @test it < 100 + @test x0 == x0_backup + end + @testset "FastForwardBackward (custom extrapolation)" begin x0 = zeros(T, n) x0_backup = copy(x0) diff --git a/test/problems/test_lasso_small_strongly_convex.jl b/test/problems/test_lasso_small_strongly_convex.jl index 678eb45..e45931d 100644 --- a/test/problems/test_lasso_small_strongly_convex.jl +++ b/test/problems/test_lasso_small_strongly_convex.jl @@ -70,6 +70,24 @@ using ProximalAlgorithms @test it < 110 @test x0 == x0_backup end + + @testset "ForwardBackward (adaptive step)" begin + solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) + y, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(y) == T + @test norm(y - x_star, Inf) <= TOL + @test it < 300 + @test x0 == x0_backup + end + + @testset "ForwardBackward (adaptive step, regret)" begin + solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true, regret_gamma=T(1.01)) + y, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(y) == T + @test norm(y - x_star, Inf) <= TOL + @test it < 80 + @test x0 == x0_backup + end @testset "FastForwardBackward" begin solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) @@ -80,6 +98,24 @@ using ProximalAlgorithms @test x0 == x0_backup end + @testset "FastForwardBackward (adaptive step)" begin + solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) + y, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(y) == T + @test norm(y - x_star, Inf) <= TOL + @test it < 100 + @test x0 == x0_backup + end + + @testset "FastForwardBackward (adaptive step, regret)" begin + solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true, regret_gamma=T(1.01)) + y, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(y) == T + @test norm(y - x_star, Inf) <= TOL + @test it < 100 + @test x0 == x0_backup + end + @testset "FastForwardBackward (custom extrapolation)" begin solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) y, it = solver( diff --git a/test/problems/test_sparse_logistic_small.jl b/test/problems/test_sparse_logistic_small.jl index 70f876f..4b4b78b 100644 --- a/test/problems/test_sparse_logistic_small.jl +++ b/test/problems/test_sparse_logistic_small.jl @@ -35,59 +35,81 @@ using LinearAlgebra TOL = R(1e-6) - # Nonfast/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ForwardBackward(tol 
= TOL, adaptive = true) - x, it = solver(x0 = x0, f = fA_autodiff, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= 1e-4 - @test it < 1100 - @test x0 == x0_backup - - # Fast/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = fA_autodiff, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= 1e-4 - @test it < 500 - @test x0 == x0_backup + @testset "ForwardBackward (adaptive step)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 1100 + @test x0 == x0_backup + end - # ZeroFPR/Adaptive + @testset "ForwardBackward (adaptive step, regret)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true, regret_gamma=R(1.01)) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 500 + @test x0 == x0_backup + end - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= 1e-4 - @test it < 25 - @test x0 == x0_backup + @testset "FastForwardBackward (adaptive step)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 500 + @test x0 == x0_backup + end - # PANOC/Adaptive + @testset "FastForwardBackward (adaptive step, regret)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true, regret_gamma=R(1.01)) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 200 + @test x0 == x0_backup + end - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= 1e-4 - @test it < 50 - @test x0 == x0_backup + @testset "ZeroFPR (adaptive step)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 25 + @test x0 == x0_backup + end - # PANOCplus/Adaptive + @testset "PANOC (adaptive step)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) + @test eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 50 + @test x0 == x0_backup + end - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= 1e-4 - @test it < 50 - @test x0 == x0_backup + @testset "PANOCplus (adaptive step)" begin + x0 = zeros(T, n) + x0_backup = copy(x0) + solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) + @test 
eltype(x) == T + @test norm(x - x_star, Inf) <= 1e-4 + @test it < 50 + @test x0 == x0_backup + end end
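
For quick reference, here is a minimal usage sketch of the new keyword, in the spirit of the lasso tests above. The problem data, the regularization weight, and the use of `LeastSquares`/`NormL1` from ProximalOperators are illustrative assumptions and not part of this patch; `regret_gamma = 1.01` mirrors the value used in the tests.

```julia
using LinearAlgebra
using ProximalOperators: LeastSquares, NormL1
using ProximalAlgorithms

# Small illustrative lasso instance (data chosen arbitrarily).
A = [1.0 2.0; 3.0 4.0; 5.0 6.0]
b = [1.0, 2.0, 3.0]
f = LeastSquares(A, b)                 # smooth term: 0.5 * ||A x - b||^2
g = NormL1(0.1 * norm(A' * b, Inf))    # nonsmooth term: lambda * ||x||_1

# Adaptive stepsize with a regret factor: gamma is enlarged by regret_gamma
# at every iteration before the backtracking line search shrinks it if needed.
solver = ProximalAlgorithms.ForwardBackward(tol = 1e-6, adaptive = true, regret_gamma = 1.01)
x, it = solver(x0 = zeros(size(A, 2)), f = f, g = g)
```

As the description above notes, values of `regret_gamma` close to 1 appear to work best in practice.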
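To make the role of `regret_gamma` concrete, the following is a schematic sketch of an adaptive forward-backward stepsize update with a regret factor. It is not the package's `backtrack_stepsize!` routine; the sufficient-decrease test, the halving factor, and the helper names (`grad_f`, `prox_g`, `regret_backtrack`) are assumptions for illustration only.

```julia
using LinearAlgebra: dot, norm

# Schematic only: enlarge gamma by the regret factor, then backtrack until the
# usual quadratic upper-bound (sufficient-decrease) condition holds or the
# lower bound minimum_gamma is reached.
function regret_backtrack(f, grad_f, prox_g, x, gamma; regret_gamma = 1.0, minimum_gamma = 1e-7)
    gamma *= regret_gamma                    # "regret" step: try a larger stepsize first
    fx, gfx = f(x), grad_f(x)
    while true
        z = prox_g(x - gamma * gfx, gamma)   # forward-backward step with the current gamma
        if f(z) <= fx + dot(gfx, z - x) + norm(z - x)^2 / (2 * gamma) || gamma <= minimum_gamma
            return gamma, z
        end
        gamma /= 2                           # backtrack: shrink gamma and retry
    end
end
```

With `regret_gamma = 1.0` (the default) this reduces to plain backtracking, so existing behavior is unchanged.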