NNAgent.m
classdef NNAgent < handle
    %NNAGENT Agent that plays the 2048 game.
    %   Uses a neural network to calculate Q-values.

    properties (Constant)
        rows = 4;
        cols = 4;
        actions = 4;
        memory_size = 1000000;
        action_labels = {'UP', 'RIGHT', 'DOWN', 'LEFT'};
    end

    properties
        exploration_steps;
        exploration_rate;
        discount_rate;
        minibatch_size;
        preprocess;
        mem;
        nnet;
        game;
    end

    methods
        function this = NNAgent(opts)
            % NNAGENT Creates a deep Q-learning agent.
            % Parameters:
            %   exploration_steps - number of training steps over which exploration decays from 1 to exploration_rate.
            %   exploration_rate - choose a random move with this probability after decay, e.g. 0.05.
            %   discount_rate - discount future rewards by this coefficient, e.g. 0.9.
            %   learning_rate - learning rate of the neural network, e.g. 0.01.
            %   momentum - momentum of learning, set to 0 for no momentum.
            %   layers - sizes of the hidden layers as a vector, e.g. [256 256].
            %   minibatch_size - minibatch size when training the network.
            %   activation_function - activation function of the hidden layers, passed to the network as-is.
            %   dropout_fraction - dropout fraction used while training, 0 disables dropout.
            %   weight_penalty - L2 weight penalty coefficient of the network.
            %   preprocess - function handle applied to states (and rewards) before feeding them to the network.
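            %
            % Illustrative example of an opts struct (hypothetical values, not taken from the repository):
            %   opts = struct('exploration_steps', 500000, 'exploration_rate', 0.05, ...
            %                 'discount_rate', 0.9, 'learning_rate', 0.01, 'momentum', 0.9, ...
            %                 'layers', [256 256], 'minibatch_size', 32, ...
            %                 'activation_function', 'tanh_opt', 'dropout_fraction', 0, ...
            %                 'weight_penalty', 0, 'preprocess', @(x) x);
            %   agent = NNAgent(opts);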
            % remember parameters
            this.exploration_steps = opts.exploration_steps;
            this.exploration_rate = opts.exploration_rate;
            this.discount_rate = opts.discount_rate;
            this.minibatch_size = opts.minibatch_size;
            this.preprocess = opts.preprocess;
            % initialize memory
            this.mem = Memory(this.memory_size, [this.rows this.cols]);
            % initialize neural network
            this.nnet = nnsetup([this.rows*this.cols opts.layers this.actions]);
            this.nnet.output = 'linear';
            this.nnet.momentum = opts.momentum;
            this.nnet.activation_function = opts.activation_function;
            this.nnet.dropoutFraction = opts.dropout_fraction;
            this.nnet.weightPenaltyL2 = opts.weight_penalty;
            this.nnet.learningRate = opts.learning_rate;
            % initialize game
            this.game = Game(this.rows, this.cols);
        end

        function results = play(this, nr_games)
            % PLAY Play the 2048 game with the deep Q-learning agent.
            % Parameters:
            %   nr_games - number of games to play. If 1, also displays debug info.
            % Returns the scores of those games as a vector.
            % You can call this function several times and it keeps improving
            % the same network, but learning_rate, momentum and layers take
            % effect only when the agent is constructed.
            iteration = 0;
            scores = [];
            avg_scores = [];
            game_count = 0;
            game_counts = [];
            losses = [];
            q_states = {};
            q_values = [];
            % play nr_games
            for i = (1:nr_games)
                % initialize game field
                this.game.new();
                results(i) = 0;
                if (nr_games == 1)
                    disp(this.game.state);
                end
                % play till the end
                terminal = false;
                while (~terminal)
                    epsilon = this.compute_epsilon(iteration);
                    % choose a random action with probability epsilon
                    if (unifrnd(0, 1) < epsilon)
                        % choose random action
                        action = randi(this.actions);
                        if (nr_games == 1)
                            disp([this.action_labels{action} '(random)'])
                        end
                    else
                        % choose the action with the best Q-value
                        qvalues = this.predict(this.game.state);
                        [~, action] = max(qvalues, [], 2);
                        if (nr_games == 1)
                            disp(['Q-values: ' num2str(qvalues)]);
                            disp([this.action_labels{action} '(predicted)'])
                        end
                    end
                    % make a move and observe the reward
                    a = this.game.state;
                    [points, changed] = this.game.move(action);
                    b = this.game.state;
                    terminal = this.game.end();
                    results(i) = results(i) + points;
                    if (nr_games == 1)
                        disp(['Reward: ', num2str(points)]);
                        disp(b);
                    end
                    % add the state transition to memory only if the board changed
                    if changed
                        this.mem.add(a, action, points, b, terminal);
                    end
                    % if memory contains enough states
                    if this.mem.size > this.minibatch_size
                        % get a minibatch from memory and train the network
                        this.train(this.mem.minibatch(this.minibatch_size));
                        iteration = iteration + 1;
                        if mod(iteration, 10000) == 0
                            if isempty(q_states)
                                q_states = this.mem.minibatch(this.minibatch_size);
                                fig = figure;
                            end
                            figure(fig);
                            % average scores
                            subplot(2,2,1);
                            avg_scores = [avg_scores mean(scores)];
                            scores = [];
                            plot(avg_scores);
                            title('Average game score');
                            % nr of games
                            subplot(2,2,2);
                            game_counts = [game_counts game_count];
                            game_count = 0;
                            plot(game_counts);
                            title('Number of games');
                            % network loss
                            subplot(2,2,3);
                            losses = [losses this.nnet.L];
                            plot(losses);
                            title('Network loss');
                            % avg max Q-value
                            subplot(2,2,4);
                            x = this.preprocess(q_states.prestates(:,:));
                            this.nnet.testing = 1;
                            this.nnet = nnff(this.nnet, x, zeros(size(x,1), this.nnet.size(end)));
                            this.nnet.testing = 0;
                            y = this.nnet.a{end};
                            q_values = [q_values mean(max(y, [], 2))];
                            plot(q_values);
                            title('Average Q-value');
                            drawnow;
                        end
                    end
                end
                disp([num2str(i) ' ' num2str(results(i)) ' ' num2str(this.compute_epsilon(this.mem.size)) ' ' num2str(this.mem.size) ' ' num2str(iteration)]);
                scores = [scores results(i)];
                game_count = game_count + 1;
            end
        end

        function e = compute_epsilon(this, iteration)
            % COMPUTE_EPSILON Compute the exploration rate based on the number of training passes.
            % Parameters:
            %   iteration - number of training passes done
            % Returns the exploration rate, which decays linearly until exploration_steps is reached.
            e = max(1 - iteration / this.exploration_steps, this.exploration_rate);
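            % For example (hypothetical values): with exploration_steps = 500000 and
            % exploration_rate = 0.05, iteration = 250000 gives e = 0.5, and any
            % iteration >= 475000 gives e = 0.05.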
        end

        function y = predict(this, a)
            % PREDICT Predict Q-values for state a.
            % Parameters:
            %   a - game state
            % Returns the predicted Q-values.
            % flatten the matrix and turn it into a one-element minibatch
            x = this.preprocess(a(:)');
            % copied from nnpredict()
            this.nnet.testing = 1;
            this.nnet = nnff(this.nnet, x, zeros(size(x,1), this.nnet.size(end)));
            this.nnet.testing = 0;
            y = this.nnet.a{end};
        end

        function train(this, b)
            % TRAIN Train the network with minibatch b and discount_rate.
            % Parameters:
            %   b - minibatch
            % Updates this.nnet in place (NNAgent is a handle class).
            % flatten states for input to the neural network
            x = this.preprocess(b.prestates(:,:));
            xx = this.preprocess(b.poststates(:,:));
            % predict Q-values of prestates
            this.nnet.testing = 1;
            this.nnet = nnff(this.nnet, x, zeros(size(x,1), this.nnet.size(end)));
            this.nnet.testing = 0;
            y = this.nnet.a{end};
            if this.discount_rate > 0
                % predict Q-values of poststates
                this.nnet.testing = 1;
                this.nnet = nnff(this.nnet, xx, zeros(size(xx,1), this.nnet.size(end)));
                this.nnet.testing = 0;
                yy = this.nnet.a{end};
                % maximum Q-value for each poststate
                yymax = max(yy, [], 2);
            end
            % calculate the discounted future reward for each state transition in the batch
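            % Standard Q-learning target for the chosen action (all other Q-values stay at their predictions):
            %   y(i, action) = reward                                            if terminal or discount_rate == 0
            %   y(i, action) = reward + discount_rate * max_a' Q(poststate, a')  otherwise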
            for i = 1:size(x,1)
                % preprocess reward the same way as input
                reward = this.preprocess(b.rewards(i));
                % only change one action, other Q-values stay the same
                if b.terminals(i) || this.discount_rate == 0
                    y(i, b.actions(i)) = reward;
                else
                    y(i, b.actions(i)) = reward + this.discount_rate * yymax(i);
                end
            end
            % train the network (copied from nntrain())
            this.nnet = nnff(this.nnet, x, y);
            this.nnet = nnbp(this.nnet);
            this.nnet = nnapplygrads(this.nnet);
        end
    end
end
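
% Training sketch (illustrative, not part of the original file), assuming an opts struct
% built as in the constructor help above:
%   agent = NNAgent(opts);
%   scores = agent.play(100);     % play (and train on) 100 games, returns per-game scores
%   agent.play(1);                % a single game with per-move debug output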