model_setup.py
from keras.layers import Conv2D, Dense, Flatten, Input
from keras.models import Model
import tensorflow as tf


class Agent(object):
    def __init__(self, name, state_shape, n_actions, reuse=False):
        """ A simple actor-critic agent """
        with tf.variable_scope(name, reuse=reuse):
            inp = Input(state_shape)
            conv1 = Conv2D(32, kernel_size=(3, 3), strides=2, activation='relu')(inp)
            conv2 = Conv2D(64, kernel_size=(3, 3), strides=2, activation='relu')(conv1)
            dense1 = Dense(256)(Flatten()(conv2))
            dense2 = Dense(32)(dense1)

            # two heads: action logits (actor) and a single state value (critic)
            out1 = Dense(n_actions)(dense2)
            out2 = Dense(1)(dense2)

            self.network = Model(inp, [out1, out2])
            self.state_t = tf.placeholder('float32', [None, ] + list(state_shape))
            self.agent_outputs = self.symbolic_step(self.state_t)
    def symbolic_step(self, state_t):
        """
        Takes a batch of observations (tf tensor) and returns the tf tensors
        the agent needs to learn from: action logits and state values.
        :param state_t: float32 tensor of observations, [batch, *state_shape]
        :return: (logits [batch, n_actions], state_values [batch])
        """
        logits, state_value = self.network(state_t)
        # the value head outputs shape [batch, 1]; squeeze it to a 1D vector
        state_value = state_value[:, 0]

        assert tf.is_numeric_tensor(state_value) and state_value.shape.ndims == 1, \
            'please return 1D tf tensor of state values [you got %s]' % repr(state_value)
        assert tf.is_numeric_tensor(logits) and logits.shape.ndims == 2, \
            'please return 2d tf tensor of logits [you got %s]' % repr(logits)
        # hint: if you triggered the state_values assert with shape [None, 1],
        # just select the [:, 0]-th element of state values as new state values

        return logits, state_value
    def step(self, sess, state_t):
        """ Same as symbolic_step except operates on numpy arrays """
        return sess.run(self.agent_outputs, {self.state_t: state_t})
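

# A hedged helper, not part of the original file: one way the numeric logits
# returned by Agent.step could be turned into sampled actions. The name
# `sample_actions` and the use of numpy here are assumptions for illustration.
def sample_actions(agent_outputs):
    """Sample one action per environment from the policy logits (numpy arrays)."""
    import numpy as np  # imported locally to leave the original module imports untouched
    logits, _ = agent_outputs
    # numerically stable softmax over the action dimension
    exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
    policy = exp / exp.sum(axis=-1, keepdims=True)
    return np.array([np.random.choice(len(p), p=p) for p in policy])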


class ActorCritic(object):
    def __init__(self, obs_shape, n_actions, agent, gamma=0.99, learning_rate=1e-4):
        self.states_ph = tf.placeholder('float32', [None, ] + list(obs_shape))
        self.next_states_ph = tf.placeholder('float32', [None, ] + list(obs_shape))
        self.actions_ph = tf.placeholder('int32', (None,))
        self.rewards_ph = tf.placeholder('float32', (None,))
        self.is_done_ph = tf.placeholder('float32', (None,))

        # logits [n_envs, n_actions] and state_values [n_envs]
        logits, state_values = agent.symbolic_step(self.states_ph)
        next_logits, next_state_values = agent.symbolic_step(self.next_states_ph)
        # zero out the value of terminal next states so bootstrapping stops there
        next_state_values = next_state_values * (1 - self.is_done_ph)

        # probabilities and log-probabilities for all actions
        probs = tf.nn.softmax(logits)          # [n_envs, n_actions]
        log_probs = tf.nn.log_softmax(logits)  # [n_envs, n_actions]

        # log-probabilities only for agent's chosen actions: the one-hot mask keeps
        # a single entry per row, so sum (not mean) over the action dimension
        logp_actions = tf.reduce_sum(log_probs * tf.one_hot(self.actions_ph, n_actions), axis=-1)  # [n_envs]

        # advantage estimate A(s, a) = r + gamma * V(s') - V(s)
        self.advantage = self.rewards_ph + gamma * next_state_values - state_values
        assert self.advantage.shape.ndims == 1, 'please compute advantage for each sample, vector of shape [n_envs,]'

        self.entropy = -tf.reduce_sum(probs * log_probs, 1)
        assert self.entropy.shape.ndims == 1, 'please compute pointwise entropy vector of shape [n_envs,]'

        self.actor_loss = -tf.reduce_mean(logp_actions * tf.stop_gradient(self.advantage)) \
            - 0.001 * tf.reduce_mean(self.entropy)

        # target state values from the one-step temporal-difference formula
        target_state_values = self.rewards_ph + gamma * next_state_values
        self.critic_loss = tf.reduce_mean((state_values - tf.stop_gradient(target_state_values)) ** 2)

        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.actor_loss + self.critic_loss)
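

# --- Usage sketch (not part of the original file) ----------------------------
# A minimal, hedged example of how Agent and ActorCritic above might be wired
# together under TF1-style graph execution. The observation shape, number of
# actions, and the random batch of transitions are assumptions for illustration.
if __name__ == '__main__':
    import numpy as np

    obs_shape, n_actions = (42, 42, 1), 6  # assumed, not taken from the original
    agent = Agent('agent', obs_shape, n_actions)
    trainer = ActorCritic(obs_shape, n_actions, agent)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # fake batch of transitions, just to show the expected feed_dict layout
        feed = {
            trainer.states_ph: np.random.rand(4, *obs_shape),
            trainer.next_states_ph: np.random.rand(4, *obs_shape),
            trainer.actions_ph: np.random.randint(0, n_actions, size=4),
            trainer.rewards_ph: np.random.rand(4),
            trainer.is_done_ph: np.zeros(4, dtype='float32'),
        }
        _, actor_loss, critic_loss = sess.run(
            [trainer.train_step, trainer.actor_loss, trainer.critic_loss], feed)
        print('actor loss: %.4f, critic loss: %.4f' % (actor_loss, critic_loss))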