-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcwrnn_tf.py
214 lines (163 loc) · 9.63 KB
/
cwrnn_tf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import numpy as np
import tensorflow as tf
class ClockworkRNN(object):
    """A Clockwork RNN - Koutnik et al. 2014 [arXiv, https://arxiv.org/abs/1402.3511].

    The Clockwork RNN (CW-RNN) partitions the hidden layer into separate
    modules (groups), each processing inputs at its own temporal granularity
    and making computations only at its prescribed clock rate. Rather than
    making the standard RNN models more complex, CW-RNN reduces the number of
    RNN parameters, improves the performance significantly in the tasks
    tested, and speeds up the network evaluation.

    Constructing an instance builds the whole graph (TensorFlow 1.x graph-mode
    API): placeholders, the unrolled recurrent model, the loss, the optimizer
    and the summary ops.

    The ``config`` object must expose the following attributes, all of which
    are read while building the graph: ``num_hidden``, ``periods``,
    ``num_steps``, ``num_input``, ``num_output``, ``batch_size``,
    ``learning_rate``, ``learning_rate_step``, ``learning_rate_decay``,
    ``learning_rate_min``, ``optimizer`` (``'adam'``, ``'rmsprop'`` or
    ``'adagrad'``) and ``max_norm_gradient``.
    """

    def __init__(self, config):
        """Validate ``config`` and build the complete training graph.

        Raises:
            ValueError: if ``config.periods`` is empty, if ``config.num_hidden``
                is not divisible by the number of periods, or if
                ``config.optimizer`` is not a known optimizer name.
        """
        self.config = config

        # The hidden units are split evenly over the groups (one group per
        # period). Note that an even split is not a requirement in the paper;
        # there the extra neurons are divided over the higher frequency groups.
        # Explicit exceptions are used instead of `assert` so the check also
        # runs under `python -O`.
        num_groups = len(self.config.periods)
        if num_groups == 0:
            raise ValueError("config.periods must contain at least one period")
        if self.config.num_hidden % num_groups != 0:
            raise ValueError(
                "config.num_hidden ({}) must be divisible by the number of "
                "periods ({})".format(self.config.num_hidden, num_groups))

        # Number of hidden units in each group
        self.group_size = self.config.num_hidden // num_groups

        # Global training step
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # Placeholders: inputs are [batch, num_steps, num_input],
        # targets are [batch, num_output]
        self.inputs = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.config.num_steps, self.config.num_input],
            name="inputs")
        self.targets = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.config.num_output],
            name="targets")

        # Build the complete model
        self._build_model()

        # Initialize the optimizer with gradient clipping
        self._init_optimizer()

        # Operations for creating summaries
        self._build_summary_ops()

    @staticmethod
    def _clockwork_mask(num_hidden, group_size, num_groups):
        """Return the block upper-triangular 0/1 mask for the recurrent weights.

        The rows belonging to group ``i`` are set to one from the first column
        of group ``i`` up to the last column; everything else stays zero.

        Returns:
            A ``[num_hidden, num_hidden]`` NumPy array of zeros and ones.
        """
        mask = np.zeros((num_hidden, num_hidden))
        for group in range(num_groups):
            start = group * group_size
            mask[start:start + group_size, start:] = 1
        return mask

    @staticmethod
    def _num_active_units(periods, group_size, time_step):
        """Return how many leading hidden units are updated at ``time_step``.

        The width covers every group up to and including the LAST group whose
        period divides ``time_step``. Groups with a lower index are therefore
        updated as well, even when their own period does not divide
        ``time_step``; when every period divides the next one (1, 2, 4, ...)
        this coincides with testing each group individually. Returns 0 when no
        period divides ``time_step``.
        """
        num_active_groups = 0
        for index, period in enumerate(periods):
            # Check if (t MOD T_i == 0)
            if time_step % period == 0:
                num_active_groups = index + 1  # note the +1
        return group_size * num_active_groups

    def _build_model(self):
        """Create the variables, unroll the cell over time and define the loss.

        Sets ``clockwork_periods``, ``clockwork_mask``, the weight/bias
        attributes, ``state``, ``final_state``, ``predictions``, ``error`` and
        ``loss``.
        """
        # Weight and bias initializers
        initializer_weights = tf.contrib.layers.variance_scaling_initializer()
        initializer_bias = tf.constant_initializer(0.0)

        # Activation function of the hidden state (the output layer is linear)
        activation_hidden = tf.tanh

        # Split into list of tensors, one for each timestep
        x_list = [tf.squeeze(x, axis=[1])
                  for x in tf.split(
                      axis=1, num_or_size_splits=self.config.num_steps,
                      value=self.inputs, name="inputs_list")]

        # Periods of each group, e.g. 1, 2, 4, ..., 256 for nine groups
        self.clockwork_periods = self.config.periods

        # Mask that makes the recurrent matrix W_H block upper triangular.
        # NOTE(review): the state is applied as `state @ W_H` (row vectors), so
        # with this mask the units of group j read from groups i <= j, i.e.
        # shorter-period groups feed longer-period ones (for ascending
        # periods). The paper appears to describe the opposite direction
        # (its upper-triangular W_H acts on column vectors) - looks transposed;
        # confirm before changing, since flipping it alters trained models.
        mask = self._clockwork_mask(
            self.config.num_hidden, self.group_size,
            len(self.clockwork_periods))
        self.clockwork_mask = tf.constant(mask, dtype=tf.float32, name="mask")

        with tf.variable_scope("input"):
            # W_I
            self.input_W = tf.get_variable(
                "W", shape=[self.config.num_input, self.config.num_hidden],
                initializer=initializer_weights)
            # b_I
            self.input_b = tf.get_variable(
                "b", shape=[self.config.num_hidden],
                initializer=initializer_bias)

        with tf.variable_scope("hidden"):
            # W_H
            self.hidden_W = tf.get_variable(
                "W", shape=[self.config.num_hidden, self.config.num_hidden],
                initializer=initializer_weights)
            # Apply the mask exactly once. From here on `self.hidden_W` is the
            # masked tensor, not the raw variable; the mask is 0/1, so
            # re-applying it at every time step would only add redundant ops.
            self.hidden_W = tf.multiply(self.hidden_W, self.clockwork_mask)
            # b_H
            self.hidden_b = tf.get_variable(
                "b", shape=[self.config.num_hidden],
                initializer=initializer_bias)

        with tf.variable_scope("output"):
            # W_O
            self.output_W = tf.get_variable(
                "W", shape=[self.config.num_hidden, self.config.num_output],
                initializer=initializer_weights)
            # b_O
            self.output_b = tf.get_variable(
                "b", shape=[self.config.num_output],
                initializer=initializer_bias)

        with tf.variable_scope("clockwork_cell"):
            # Initialize the hidden state of the cell to zero (this is y_{t-1}).
            # NOTE(review): the state has a fixed first dimension of
            # config.batch_size while the placeholders use `None`; fed batches
            # presumably have to match config.batch_size - confirm with callers.
            self.state = tf.get_variable(
                "state",
                shape=[self.config.batch_size, self.config.num_hidden],
                initializer=tf.zeros_initializer(), trainable=False)

            # No variables are created inside the loop, so no variable reuse
            # handling is needed while unrolling.
            for time_step in range(self.config.num_steps):
                # Number of leading hidden units that are active at this step
                active_units = self._num_active_units(
                    self.clockwork_periods, self.group_size, time_step)

                # Compute (W_I*x_t + b_I) for the active units
                WI_x = tf.matmul(
                    x_list[time_step],
                    tf.slice(self.input_W, [0, 0], [-1, active_units]))
                WI_x = tf.nn.bias_add(
                    WI_x, tf.slice(self.input_b, [0], [active_units]),
                    name="WI_x")

                # Compute (W_H*y_{t-1} + b_H) for the active units, using the
                # already masked recurrent matrix
                WH_y = tf.matmul(
                    self.state,
                    tf.slice(self.hidden_W, [0, 0], [-1, active_units]))
                WH_y = tf.nn.bias_add(
                    WH_y, tf.slice(self.hidden_b, [0], [active_units]),
                    name="WH_y")

                # Compute y_t = tanh(...) for the active units
                y_update = tf.add(WH_y, WI_x, name="state_update")
                y_update = activation_hidden(y_update)

                # New state: updated active units followed by the unchanged
                # remainder of the previous state
                self.state = tf.concat(
                    axis=1,
                    values=[y_update,
                            tf.slice(self.state, [0, active_units], [-1, -1])])

        # Save the final hidden state
        self.final_state = self.state

        # Compute the (linear) output, y = W_O*y_t + b_O
        self.predictions = tf.matmul(self.final_state, self.output_W)
        self.predictions = tf.nn.bias_add(self.predictions, self.output_b)

        # Loss: squared error summed over outputs, averaged over the batch
        self.error = tf.reduce_sum(
            tf.square(self.targets - self.predictions), axis=1)
        self.loss = tf.reduce_mean(self.error, name="loss")

    def _init_optimizer(self):
        """Create the learning-rate schedule, optimizer, train op and gradient summaries.

        Sets ``learning_rate``, ``optimizer``, ``grads_and_vars``, ``train_op``
        and ``gradient_summaries_merged``.

        Raises:
            ValueError: if ``config.optimizer`` is not ``'adam'``,
                ``'rmsprop'`` or ``'adagrad'``.
        """
        # Staircase exponential learning rate decay, bounded from below by
        # config.learning_rate_min. With config.learning_rate_decay == 1.0 the
        # schedule is effectively disabled, i.e. the learning rate is constant.
        self.learning_rate = tf.train.exponential_decay(
            self.config.learning_rate,
            self.global_step,
            self.config.learning_rate_step,
            self.config.learning_rate_decay,
            staircase=True
        )
        self.learning_rate = tf.maximum(
            self.learning_rate, self.config.learning_rate_min)
        tf.summary.scalar("learning_rate", self.learning_rate)

        # Definition of the optimizer
        if self.config.optimizer == 'adam':
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
        elif self.config.optimizer == 'rmsprop':
            self.optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.learning_rate)
        elif self.config.optimizer == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate)
        else:
            raise ValueError("Unknown optimizer specified")

        # Compute the gradients for each variable
        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)

        if self.config.max_norm_gradient > 0:
            # Clip the gradients by their global norm before applying them
            grads, variables = zip(*self.grads_and_vars)
            grads_clipped, _ = tf.clip_by_global_norm(
                grads, clip_norm=self.config.max_norm_gradient)
            self.train_op = self.optimizer.apply_gradients(
                zip(grads_clipped, variables),
                global_step=self.global_step
            )
        else:
            # Unclipped gradients
            self.train_op = self.optimizer.apply_gradients(
                self.grads_and_vars,
                global_step=self.global_step
            )

        # Keep track of gradient values and their sparsity. The summaries are
        # built from the unclipped gradients in `self.grads_and_vars`.
        grad_summaries = []
        for grad, var in self.grads_and_vars:
            if grad is None:
                continue
            grad_summaries.append(tf.summary.histogram(
                "gradients/{}/hist".format(var.name), grad))
            grad_summaries.append(tf.summary.scalar(
                "gradients/{}/sparsity".format(var.name),
                tf.nn.zero_fraction(grad)))
        self.gradient_summaries_merged = tf.summary.merge(grad_summaries)

    def _build_summary_ops(self):
        """Create ``train_summary_op``: loss, learning rate and gradient summaries."""
        # Training summaries
        training_summaries = [
            tf.summary.scalar("train/loss", self.loss),
            tf.summary.scalar("train/learning_rate", self.learning_rate),
        ]

        # Combine the training summaries with the gradient summaries, passing
        # a flat list of summary tensors to the merge op.
        self.train_summary_op = tf.summary.merge(
            training_summaries + [self.gradient_summaries_merged])