default_mlp.m
% default_mlp - construct a default MLP structure (hyperparameters and initial weights)
% Copyright (C) 2011 KyungHyun Cho, Tapani Raiko, Alexander Ilin
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
%
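%
% A minimal usage sketch (the layer sizes are only an example; the training
% routine that consumes this structure, e.g. mlp.m in this repository, is
% assumed and not shown). Every field overridden below is defined further
% down in this file:
%
%   M = default_mlp([784 500 10]);  % 784 inputs, 500 hidden units, 10 outputs
%   M.hidden.use_tanh = 2;          % 2 selects relu hidden units (see below)
%   M.learning.lrate = 1e-2;        % larger initial learning rate
%   M.dropout.use = 1;              % enable dropout (default probability 0.5)
%   M.iteration.n_epochs = 50;      % train for 50 epochs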
function [M] = default_mlp (layers)
% structure
n_layers = length(layers);
M.structure.layers = layers;
% data type
M.data.binary = 1;
%M.data.binary = 0;
% output type
M.output.binary = 1; % for classification
%M.output.binary = 0; % for regression
% nonlinearity (the name of this variable will change in a later revision)
% 0 - sigmoid
% 1 - tanh
% 2 - relu
M.hidden.use_tanh = 0;
% is it being initialized with a DBM?
M.dbm.use = 0;
% learning parameters
M.learning.lrate = 1e-3;
M.learning.lrate0 = 5000;
M.learning.momentum = 0;
M.learning.weight_decay = 0;
M.learning.minibatch_sz = 100;
M.learning.lrate_anneal = 0.9;
M.valid_min_epochs = 10;
M.dropout.use = 0;
% dropout probability of 0.5 for every unit, by default
M.dropout.probs = cell(n_layers, 1);
for l = 1:n_layers
    M.dropout.probs{l} = 0.5 * ones(layers(l), 1);
end
% input normalization (mean and standard deviation)
M.do_normalize = 1;
M.do_normalize_std = 1;
% stopping criterion
% if you happen to know other criteria, please do add them.
% if the criterion is zero, training stops only when all training epochs have been consumed.
M.stop.criterion = 0;
% used when criterion == 1 (reconstruction-error based stopping)
M.stop.recon_error.tolerate_count = 1000;
% denoising
M.noise.drop = 0.1;
M.noise.level = 0.1;
% initializations
M.W = cell(n_layers, 1);
M.biases = cell(n_layers, 1);
for l = 1:n_layers
    M.biases{l} = zeros(layers(l), 1);
    if l < n_layers
        %M.W{l} = 1/sqrt(layers(l)+layers(l+1)) * randn(layers(l), layers(l+1));
        % uniform initialization in [-sqrt(6/(n_in+n_out)), sqrt(6/(n_in+n_out))]
        M.W{l} = 2 * sqrt(6)/sqrt(layers(l)+layers(l+1)) * (rand(layers(l), layers(l+1)) - 0.5);
    end
end
% adagrad
M.adagrad.use = 0;
M.adagrad.epsilon = 1e-8;
M.adagrad.W = cell(n_layers, 1);
M.adagrad.biases = cell(n_layers, 1);
for l = 1:n_layers
    M.adagrad.biases{l} = zeros(layers(l), 1);
    if l < n_layers
        M.adagrad.W{l} = zeros(layers(l), layers(l+1));
    end
end
% adadelta
M.adadelta.use = 0;
M.adadelta.momentum = 0.99;
M.adadelta.epsilon = 1e-6;
M.adadelta.gW = cell(n_layers, 1);
M.adadelta.gbiases = cell(n_layers, 1);
M.adadelta.W = cell(n_layers, 1);
M.adadelta.biases = cell(n_layers, 1);
for l = 1:n_layers
    M.adadelta.gbiases{l} = zeros(layers(l), 1);
    M.adadelta.biases{l} = zeros(layers(l), 1);
    if l < n_layers
        M.adadelta.gW{l} = zeros(layers(l), layers(l+1));
        M.adadelta.W{l} = zeros(layers(l), layers(l+1));
    end
end
% iteration
M.iteration.n_epochs = 100;
M.iteration.n_updates = 0;
% learning signals
M.signals.recon_errors = [];
M.signals.valid_errors = [];
M.signals.lrates = [];
M.signals.n_epochs = 0;
% debug
M.verbose = 0;
M.debug.do_display = 0;
M.debug.display_interval = 10;
M.debug.display_fid = 1;
M.debug.display_function = @visualize_dae;
% hooks: called after every epoch and after every parameter update
M.hook.per_epoch = {@save_intermediate, {'mlp.mat'}};
M.hook.per_update = {@print_n_updates, {}};