-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsr1_update.m
176 lines (147 loc) · 4.62 KB
/
sr1_update.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
function [delta, M, W, h] = sr1_update(x, g, n_max, h)
% [DELTA, M, W, HIST] = LBFGS_UPDATE(X, G, N_MAX, HIST)
%
% Description
% Update the L-BFGS approximation to the Hessian of a function.
%
% The L-BFGS approximation to a Hessian, H, is given by
%
% H ~ DELTA * I - W * M * W*T.
%
% This is known as the compact, or outer product, representation of the
% quasi-Newton matrix (see Reference).
%
%
% Inputs
% X: Vector.
% Current value of X, the optimization variable.
%
% G: Vector.
% Current value of the gradient of the function to be approximated.
%
% N_MAX: Positive integer.
% The maximum number of previous (X, G) pairs to retain in the
% approximation. Recommended values of N_MAX are 3 and 5.
%
% HIST: Structure.
% Structure containing the previous (X, G) pairs of the function,
% as well as associated computed values.
%
% To initiate a restart, use HIST = [].
%
%
% Outputs
% DELTA: Positive scalar.
% See below.
%
% M, W: Matrices.
% DELTA, M, and W form the quasi-Newton (L-BFGS) approximation of the
% Hessian, as outlined above.
%
% HIST: Structure.
% Contains the updated history of previous (X, G) pairs. Use HIST as an
% input to the subsequent L-BFGS update.
%
%
% Reference
% Chapter 7.2, Nocedal and Wright, Numerical Optimization (Cambridge 2004).
%
% Check if we need to restart.
% A restart (or start) is initiated if there is no history.
%
if isempty(h)
% Components of the quasi-Newton approximation.
delta = 1; % For restart, we simply guess a scaling value of 1.
W = zeros(length(x), 0);
M = zeros(0);
% Initialize the history structure.
h.n = 0; % Index, tells us how "full" S and Y are.
h.S = zeros(length(x), n_max); % Corresponds to S in the reference.
h.Y = zeros(length(x), n_max); % Corresponds to Y in the reference.
h.x_prev = x; % Used to calculate s for next iteration.
h.g_prev = g; % Used to calculate y for next iteration.
h.delta = delta; % Store these in case we need to damp the next update.
h.W = W;
h.M = M;
return
end
%
% Calculate current values of s and y.
%
s = x - h.x_prev;
y = g - h.g_prev;
%
% Update history.
%
h.x_prev = x;
h.g_prev = g;
% % Compute Bs.
% yBs = y - (h.delta * s - h.W * (h.M * (h.W' * s)));
% syBs = s' * yBs;
%
% % Decide whether or not to skip update.
% if norm(syBs) < 1e-8 * norm(s) * norm(yBs)
% warning('Skipped SR1 update.');
% delta = h.delta;
% M = h.M;
% W = h.W;
% return
% end
%
if h.n < n_max % History not full.
n = h.n + 1;
else % History full, delete oldest entry.
n = n_max;
h.S(:, 1:n_max-1) = h.S(:, 2:n_max);
h.Y(:, 1:n_max-1) = h.Y(:, 2:n_max);
h.d(:, 1:n_max-1) = h.d(2:n_max);
h.L(1:n_max-1, 1:n_max-1) = h.L(2:n_max, 2:n_max);
h.S_dot_S(1:n_max-1, 1:n_max-1) = h.S_dot_S(2:n_max, 2:n_max);
end
h.n = n;
% %
% % Check curvature condition, and damp update if needed.
% %
%
%
% % Check curvature condition.
% if (sy <= 0.2 * sBs)
% % warning('Curvature condition broken (s dot y = %e)!', (s' * y));
%
% % Apply damped update (Chapter 18.3 of Reference).
% theta = 0.8 * sBs / (sBs - sy);
% y = theta * y + (1 - theta) * Bs; % Replace y.
% end
% % fprintf('0.2 * sBs: %e , sy: %e\n', 0.2*sBs, s' * y);
% Insert new values of s and y into the history.
h.S(:,n) = s;
h.Y(:,n) = y;
%
% Calculate components needed to form delta, M, and W.
%
% These dot products should be the bulk of the processing for this function.
s_dot_y = s' * h.Y(:, 1:n);
s_dot_s = s' * h.S(:, 1:n);
% Update components used to form delta, M, and W.
h.S_dot_S(n, 1:n) = s_dot_s; % Inner product S' * S.
h.S_dot_S(1:n-1, n) = s_dot_s(1:n-1);
h.d(n) = s_dot_y(n); % Vector containing diagonal elements of a diagonal matrix.
h.L(n, 1:n) = [s_dot_y(1:n-1), 0]; % Lower-diagonal matrix.
%
% Form delta, M, and W.
% The compact representation of the L-BFGS approximation.
%
delta = norm(y)^2 / s_dot_y(n); % Scaling factor.
% Use matrix inverse, since M is a square matrix of size (2*n_max) x (2*n_max).
M = -inv(-delta*h.S_dot_S + h.L + h.L' + diag(h.d));
W = h.Y(:, 1:n) - delta*h.S(:, 1:n);
% Recalculate delta to made sure we produce a positive definite approximation.
min_eig1 = min(real(eig(delta*eye(size(W,1)) - W * M * W')))
if (min_eig1 < 1)
delta = delta + (1 - min_eig1)
min_eig2 = min(real(eig(delta*eye(size(W,1)) - W * M * W')))
% pause
end
h.delta = delta;
h.W = W; % Store these in case we need to damp the next update.
h.M = M;