Cleanup ridge auto tuning
joaquimg committed Dec 3, 2021
1 parent ebee185 commit 9e12b41
134 changes: 64 additions & 70 deletions docs/src/examples/autotuning-ridge.jl
@@ -22,105 +22,96 @@
# computed with DiffOpt, we can perform a gradient descent on top of the inner model
# to minimize the test loss.
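
# Schematically, this can be viewed as a bilevel problem: an outer gradient
# descent on the regularization parameter α wrapped around the inner ridge
# problem, whose solution ŵ(α) is differentiated with DiffOpt. With ℓ_test
# denoting the test loss and η the fixed step size used further below (both
# symbols are introduced here only for exposition), the outer iteration is
#
# ```math
# \min_{\alpha} \; \ell_{\mathrm{test}}(\hat{w}(\alpha)),
# \qquad
# \alpha_{k+1} = \alpha_k - \eta \, \frac{\mathrm{d}\ell_{\mathrm{test}}}{\mathrm{d}\alpha}(\alpha_k).
# ```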

# This tutorial uses the following packages

using JuMP # The mathematical programming modelling language
import DiffOpt # JuMP extension for differentiable optimization
import OSQP # Optimization solver that handles quadratic programs
import Plots # Graphing tool
import LinearAlgebra: norm, dot
import Random

# ## Generating a noisy regression dataset

Random.seed!(42)

N = 100
D = 20
noise = 5

w_real = 10 * randn(D)
X = 10 * randn(N, D)
y = X * w_real + noise * randn(N)
l = N ÷ 2 # test train split

X_train = X[1:l, :]
X_test = X[l+1:N, :]
y_train = y[1:l]
y_test = y[l+1:N];

# ## Defining the regression problem

# We implement the regularized regression problem as a function taking the problem data,
# building a JuMP model and solving it.
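
# Given the `N × D` matrix `X` and targets `y`, the scaled ridge problem solved
# here is (the scaling below matches the objective written in the code rather
# than any standard textbook convention):
#
# ```math
# \min_{w} \; \frac{\lVert X w - y \rVert^2}{2 N D} + \frac{\alpha \lVert w \rVert^2}{2 D}.
# ```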

function fit_ridge(model, X, y, α)
    JuMP.empty!(model)
    set_silent(model)
    N, D = size(X)
    @variable(model, w[1:D])
    err_term = X * w - y
    @objective(
        model,
        Min,
        dot(err_term, err_term) / (2 * N * D) + α * dot(w, w) / (2 * D),
    )
    optimize!(model)
    @assert termination_status(model) == MOI.OPTIMAL
    return w
end

# We can solve the problem for several values of α
# to visualize the effect of regularization on the testing and training loss.
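
# For reference, the quantity recorded for each α is the scaled mean squared
# error, using the same 1/(2ND) scaling as the training objective and the size
# of the corresponding split:
#
# ```math
# \mathrm{MSE}(\hat{w}) = \frac{\lVert X \hat{w} - y \rVert^2}{2 N D}.
# ```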

αs = 0.05:0.01:0.35
mse_test = Float64[]
mse_train = Float64[]
model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
(Ntest, D) = size(X_test)
(Ntrain, D) = size(X_train)
for α in αs
    w = fit_ridge(model, X_train, y_train, α)
    ŵ = value.(w)
    ŷ_test = X_test * ŵ
    ŷ_train = X_train * ŵ
    push!(mse_test, norm(ŷ_test - y_test)^2 / (2 * Ntest * D))
    push!(mse_train, norm(ŷ_train - y_train)^2 / (2 * Ntrain * D))
end

# Visualize the Mean Squared Error metric

Plots.plot(
    αs, mse_test ./ sum(mse_test),
    label="MSE test", xaxis = "α", yaxis="MSE", legend=(0.8, 0.2)
)
Plots.plot!(
    αs, mse_train ./ sum(mse_train),
    label="MSE train"
)
Plots.title!("Normalized MSE on training and testing sets")

# ## Leveraging differentiable optimization: computing the derivative of the solution

# Using DiffOpt, we can compute `∂w_i/∂α`, the derivative of the learned solution `ŵ`
# w.r.t. the regularization parameter.
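
# Since α enters the inner problem only through the objective, the forward-mode
# sensitivity passed to DiffOpt is the partial derivative of that objective with
# respect to α, written as a function of the decision variables w:
#
# ```math
# \frac{\partial}{\partial \alpha}
# \left( \frac{\lVert X w - y \rVert^2}{2 N D} + \frac{\alpha \lVert w \rVert^2}{2 D} \right)
# = \frac{\lVert w \rVert^2}{2 D},
# ```
#
# which is the expression set through `DiffOpt.ForwardInObjective()` below.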

function compute_dw_dα(model, w)
    D = length(w)
    dw_dα = zeros(D)
    MOI.set(
        model,
        DiffOpt.ForwardInObjective(),
        dot(w, w) / (2 * D),
    )
    DiffOpt.forward(model)
    for i in 1:D
@@ -133,13 +124,13 @@ function compute_dw_dparam(model, w)
    return dw_dα
end

# Using `∂w_i/∂α` computed with `compute_dw_dα`,
# we can compute the derivative of the test loss w.r.t. the parameter α
# by composing derivatives.
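
# Writing the test loss as ℓ_test(ŵ) = ‖X_test ŵ - y_test‖² / (2ND), with
# (N, D) = size(X_test), the chain rule gives
#
# ```math
# \frac{\mathrm{d}\ell_{\mathrm{test}}}{\mathrm{d}\alpha}
# = \frac{1}{N D} \sum_{i=1}^{N}
# \left( X_{\mathrm{test}}[i, :] \cdot \frac{\partial \hat{w}}{\partial \alpha} \right)
# \left( X_{\mathrm{test}}[i, :] \cdot \hat{w} - y_{\mathrm{test}, i} \right),
# ```
#
# which is the quantity assembled below from `dw_dα` and the test residuals.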

function d_testloss_dα(model, X_test, y_test, w, ŵ)
    N, D = size(X_test)
    dw_dα = compute_dw_dα(model, w)
    err_term = X_test * ŵ - y_test
    return sum(eachindex(err_term)) do i
        dot(X_test[i,:], dw_dα) * err_term[i]
@@ -151,31 +142,34 @@ end

function descent(α0, max_iters=100; fixed_step = 0.01, grad_tol=1e-3)
    α_s = Float64[]
    test_loss = Float64[]
    α = α0
    N, D = size(X_test)
    model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
    for iter in 1:max_iters
        w = fit_ridge(model, X_train, y_train, α)
        ŵ = value.(w)
        err_term = X_test * ŵ - y_test
        push!(α_s, α)
        push!(test_loss, norm(err_term)^2 / (2 * N * D))
        ∂α = d_testloss_dα(model, X_test, y_test, w, ŵ)
        α -= fixed_step * ∂α
        if abs(∂α) ≤ grad_tol
            break
        end
    end
    return α_s, test_loss
end

ᾱ_l, msē_l = descent(0.10, 500);
ᾱ_r, msē_r = descent(0.33, 500);

# Visualize gradient descent and convergence

Plots.plot(
    αs, mse_test,
    label="MSE test", xaxis = ("α"), legend=:topleft
)
Plots.plot!(ᾱ_l, msē_l, label="learned α, start = 0.10", lw = 2)
Plots.plot!(ᾱ_r, msē_r, label="learned α, start = 0.33", lw = 2)
Plots.title!("Regularizer learning")
