Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GRU support for DS2 model. #214

Merged
merged 4 commits into from
Sep 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions deep_speech_2/demo_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gru",
    default=True,
    # BUG FIX: type=bool is broken for argparse — bool("False") is True
    # (any non-empty string is truthy), so "--use_gru False" could never
    # disable GRU. Parse accepted true-strings explicitly; everything else
    # (including "False"/"0") yields False. Default stays True.
    type=lambda v: str(v).lower() in ("true", "t", "yes", "1"),
    help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
Expand Down Expand Up @@ -199,6 +204,7 @@ def start_server():
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)

# prepare ASR inference handler
Expand Down
6 changes: 6 additions & 0 deletions deep_speech_2/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gru",
    default=True,
    # BUG FIX: type=bool is broken for argparse — bool("False") is True
    # (any non-empty string is truthy), so "--use_gru False" could never
    # disable GRU. Parse accepted true-strings explicitly; everything else
    # (including "False"/"0") yields False. Default stays True.
    type=lambda v: str(v).lower() in ("true", "t", "yes", "1"),
    help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
Expand Down Expand Up @@ -142,6 +147,7 @@ def evaluate():
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)

error_rate_func = cer if args.error_rate_type == 'cer' else wer
Expand Down
6 changes: 6 additions & 0 deletions deep_speech_2/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gru",
    default=True,
    # BUG FIX: type=bool is broken for argparse — bool("False") is True
    # (any non-empty string is truthy), so "--use_gru False" could never
    # disable GRU. Parse accepted true-strings explicitly; everything else
    # (including "False"/"0") yields False. Default stays True.
    type=lambda v: str(v).lower() in ("true", "t", "yes", "1"),
    help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
Expand Down Expand Up @@ -143,6 +148,7 @@ def infer():
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
Expand Down
73 changes: 65 additions & 8 deletions deep_speech_2/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,45 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectional GRU layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
                 NOTE(review): currently unused in the body — none of the
                 sub-layers below receive it, so all parameters are
                 auto-named; confirm this is intended.
    :type name: string
    :param size: Number of GRU cells (per direction).
    :type size: int
    :param input: Input layer.
    :type input: LayerOutput
    :param act: Activation type passed to the GRU memory cells.
    :type act: BaseActivation
    :return: Bidirectional GRU layer (forward- and backward-in-time
             outputs concatenated).
    :rtype: LayerOutput
    """
    # Separate input-to-hidden projections per direction (no shared
    # param_attr, so the two directions learn independent weights).
    # Each projects to 3 * size — presumably the update/reset/candidate
    # parts expected by grumemory; verify against Paddle's grumemory docs.
    input_proj_forward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    input_proj_backward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # Batch norm applied only to the input-state projections, not to the
    # recurrent hidden-hidden path.
    input_proj_bn_forward = paddle.layer.batch_norm(
        input=input_proj_forward, act=paddle.activation.Linear())
    input_proj_bn_backward = paddle.layer.batch_norm(
        input=input_proj_backward, act=paddle.activation.Linear())
    # Run the GRU forward and backward in time.
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn_forward, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn_backward, act=act, reverse=True)
    # Concatenate both directions' outputs along the feature dimension.
    return paddle.layer.concat(input=[forward_gru, backward_gru])


def conv_group(input, num_stacks):
"""Convolution group with stacked convolution layers.

Expand All @@ -87,9 +126,9 @@ def conv_group(input, num_stacks):
filter_size=(11, 41),
num_channels_in=1,
num_channels_out=32,
stride=(3, 2),
stride=(2, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
act=paddle.activation.Relu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
Expand All @@ -98,13 +137,13 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
act=paddle.activation.Relu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
def rnn_group(input, size, num_stacks, use_gru):
"""RNN group with stacked bidirectional simple RNN layers.

:param input: Input layer.
Expand All @@ -113,13 +152,25 @@ def rnn_group(input, size, num_stacks):
:type size: int
:param num_stacks: Number of stacked rnn layers.
:type num_stacks: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:return: Output layer of the RNN group.
:rtype: LayerOutput
"""
output = input
for i in xrange(num_stacks):
output = bidirectional_simple_rnn_bn_layer(
name=str(i), input=output, size=size, act=paddle.activation.BRelu())
if use_gru:
output = bidirectional_gru_bn_layer(
name=str(i),
input=output,
size=size,
act=paddle.activation.Relu())
else:
output = bidirectional_simple_rnn_bn_layer(
name=str(i),
input=output,
size=size,
act=paddle.activation.Relu())
return output


Expand All @@ -128,7 +179,8 @@ def deep_speech2(audio_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256):
rnn_size=256,
use_gru=True):
"""
The whole DeepSpeech2 model structure (a simplified version).

Expand All @@ -144,6 +196,8 @@ def deep_speech2(audio_data,
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
Expand All @@ -161,7 +215,10 @@ def deep_speech2(audio_data,
block_y=conv_group_height)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
input=conv2seq,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
Expand Down
9 changes: 5 additions & 4 deletions deep_speech_2/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ class DeepSpeech2Model(object):
"""

def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, pretrained_model_path):
rnn_layer_size, use_gru, pretrained_model_path):
self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size)
rnn_layer_size, use_gru)
self._create_parameters(pretrained_model_path)
self._inferer = None
self._loss_inferer = None
Expand Down Expand Up @@ -226,7 +226,7 @@ def _create_parameters(self, model_path=None):
gzip.open(model_path))

def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size):
rnn_layer_size, use_gru):
"""Create data layers and model network."""
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
Expand All @@ -243,4 +243,5 @@ def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_layer_size)
rnn_size=rnn_layer_size,
use_gru=use_gru)
8 changes: 7 additions & 1 deletion deep_speech_2/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,14 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
default=1280,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mark here, replace by actual size later.

type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gru",
    default=True,
    # BUG FIX: type=bool is broken for argparse — bool("False") is True
    # (any non-empty string is truthy), so "--use_gru False" could never
    # disable GRU. Parse accepted true-strings explicitly; everything else
    # (including "False"/"0") yields False. Default stays True.
    type=lambda v: str(v).lower() in ("true", "t", "yes", "1"),
    help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--adam_learning_rate",
default=5e-4,
Expand Down Expand Up @@ -170,6 +175,7 @@ def train():
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.init_model_path)
ds2_model.train(
train_batch_reader=train_batch_reader,
Expand Down
6 changes: 6 additions & 0 deletions deep_speech_2/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gru",
    default=True,
    # BUG FIX: type=bool is broken for argparse — bool("False") is True
    # (any non-empty string is truthy), so "--use_gru False" could never
    # disable GRU. Parse accepted true-strings explicitly; everything else
    # (including "False"/"0") yields False. Default stays True.
    type=lambda v: str(v).lower() in ("true", "t", "yes", "1"),
    help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
Expand Down Expand Up @@ -158,6 +163,7 @@ def tune():
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)

# create grid for search
Expand Down