Revert back to support input-hidden weights sharing between bi-directional RNNs. #224

Merged 1 commit on Sep 4, 2017
12 changes: 10 additions & 2 deletions deep_speech_2/demo_server.py
@@ -63,9 +63,16 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
@@ -205,7 +212,8 @@ def start_server():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)
pretrained_model_path=args.model_filepath,
share_rnn_weights=args.share_rnn_weights)

# prepare ASR inference handler
def file_to_transcript(filename):
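Note on the new flag: `--share_rnn_weights` uses `distutils.util.strtobool` as its argparse type, so boolean-ish strings given on the command line ("true"/"false", "1"/"0", "yes"/"no") are converted to 1 or 0 before being stored. A minimal standalone sketch of that behavior (illustrative only, not part of this PR; the flag below mirrors the one added above):

```python
# Minimal sketch of how a strtobool-typed argparse flag behaves.
# distutils.util.strtobool maps "y/yes/t/true/on/1" -> 1 and
# "n/no/f/false/off/0" -> 0, so the parsed value is an int, not a bool.
import argparse
import distutils.util

parser = argparse.ArgumentParser()
parser.add_argument(
    "--share_rnn_weights",
    default=True,
    type=distutils.util.strtobool,
    help="Share input-hidden weights between the two RNN directions.")

args = parser.parse_args(["--share_rnn_weights", "false"])
print(args.share_rnn_weights)        # 0
print(bool(args.share_rnn_weights))  # False
```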
12 changes: 10 additions & 2 deletions deep_speech_2/evaluate.py
@@ -35,9 +35,16 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
@@ -148,7 +155,8 @@ def evaluate():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)
pretrained_model_path=args.model_filepath,
share_rnn_weights=args.share_rnn_weights)

error_rate_func = cer if args.error_rate_type == 'cer' else wer
error_sum, num_ins = 0.0, 0
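For reference, `wer` and `cer` in the hunk above come from the project's error-rate utilities; they are assumed to implement the standard definition of edit distance normalized by reference length. A rough sketch of a word error rate computation under that assumption (illustrative, not the repo's code):

```python
# Rough sketch of word error rate (WER) as Levenshtein distance over words,
# normalized by the reference length. The repo's `wer`/`cer` helpers are
# assumed to follow this standard definition; details may differ.
def levenshtein(ref, hyp):
    # Dynamic-programming edit distance between two token sequences.
    dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dist[i][0] = i
    for j in range(len(hyp) + 1):
        dist[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return dist[len(ref)][len(hyp)]


def word_error_rate(reference, hypothesis):
    ref_words = reference.split()
    return float(levenshtein(ref_words, hypothesis.split())) / len(ref_words)


print(word_error_rate("the cat sat", "the cat sat down"))  # 0.3333...
```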
12 changes: 10 additions & 2 deletions deep_speech_2/infer.py
@@ -30,9 +30,16 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
@@ -149,7 +156,8 @@ def infer():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)
pretrained_model_path=args.model_filepath,
share_rnn_weights=args.share_rnn_weights)
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decode_method=args.decode_method,
89 changes: 63 additions & 26 deletions deep_speech_2/layer.py
@@ -39,7 +39,7 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.

@@ -51,24 +51,50 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
:type size: int
:param act: Activation type.
:type act: BaseActivation
:param share_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
:type share_weights: bool
:return: Bidirectional simple rnn layer.
:rtype: LayerOutput
"""
# input-hidden weights shared across bi-direcitonal rnn.
input_proj_forward = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
input_proj_backward = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn_forward = paddle.layer.batch_norm(
input=input_proj_forward, act=paddle.activation.Linear())
input_proj_bn_backward = paddle.layer.batch_norm(
input=input_proj_backward, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn_forward, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn_backward, act=act, reverse=True)
if share_weights:
# input-hidden weights shared between bi-directional rnn.
input_proj = paddle.layer.fc(
input=input,
size=size,
act=paddle.activation.Linear(),
bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn = paddle.layer.batch_norm(
input=input_proj, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=True)

else:
input_proj_forward = paddle.layer.fc(
input=input,
size=size,
act=paddle.activation.Linear(),
bias_attr=False)
input_proj_backward = paddle.layer.fc(
input=input,
size=size,
act=paddle.activation.Linear(),
bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn_forward = paddle.layer.batch_norm(
input=input_proj_forward, act=paddle.activation.Linear())
input_proj_bn_backward = paddle.layer.batch_norm(
input=input_proj_backward, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn_forward, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn_backward, act=act, reverse=True)

return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


@@ -87,7 +113,6 @@ def bidirectional_gru_bn_layer(name, input, size, act):
:return: Bidirectional GRU layer.
:rtype: LayerOutput
"""
# input-hidden weights shared across bi-direcitonal rnn.
input_proj_forward = paddle.layer.fc(
input=input,
size=size * 3,
@@ -98,7 +123,7 @@ def bidirectional_gru_bn_layer(name, input, size, act):
size=size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
# batch norm is only performed on input-state projection
# batch norm is only performed on input-related projections
input_proj_bn_forward = paddle.layer.batch_norm(
input=input_proj_forward, act=paddle.activation.Linear())
input_proj_bn_backward = paddle.layer.batch_norm(
@@ -126,9 +151,9 @@ def conv_group(input, num_stacks):
filter_size=(11, 41),
num_channels_in=1,
num_channels_out=32,
stride=(2, 2),
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.Relu())
act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
@@ -137,13 +162,13 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.Relu())
act=paddle.activation.BRelu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks, use_gru):
def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
"""RNN group with stacked bidirectional simple RNN layers.

:param input: Input layer.
@@ -154,6 +179,10 @@ def rnn_group(input, size, num_stacks, use_gru):
:type num_stacks: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
It is only available when use_gru=False.
:type share_rnn_weights: bool
:return: Output layer of the RNN group.
:rtype: LayerOutput
"""
@@ -165,12 +194,14 @@ def rnn_group(input, size, num_stacks, use_gru):
input=output,
size=size,
act=paddle.activation.Relu())
# BRelu is not supported by hppl yet; use Relu instead for now.
else:
output = bidirectional_simple_rnn_bn_layer(
name=str(i),
input=output,
size=size,
act=paddle.activation.Relu())
act=paddle.activation.BRelu(),
share_weights=share_rnn_weights)
return output


@@ -180,9 +211,10 @@ def deep_speech2(audio_data,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256,
use_gru=True):
use_gru=False,
share_rnn_weights=True):
"""
The whole DeepSpeech2 model structure (a simplified version).
The whole DeepSpeech2 model structure.

:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
@@ -198,6 +230,10 @@ def deep_speech2(audio_data,
:type rnn_size: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
It is only available when use_gru=False.
:type share_rnn_weights: bool
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
@@ -218,7 +254,8 @@ def deep_speech2(audio_data,
input=conv2seq,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru)
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
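What `share_weights` changes structurally: when it is True, a single input projection (`paddle.layer.fc`) feeds both the forward and the backward recurrent layer, so the two directions use one input-to-hidden weight matrix; when it is False, each direction gets its own projection. The recurrent (hidden-to-hidden) weights are separate in both cases. A simplified numpy sketch of the same idea, assuming a plain tanh simple RNN and omitting the sequence-wise batch normalization and BRelu used in the real layer:

```python
# Simplified numpy illustration (not PaddlePaddle code) of input-hidden weight
# sharing in a bidirectional simple RNN. Only W_ih can be shared; each
# direction always keeps its own recurrent matrix W_hh.
import numpy as np


def bidirectional_simple_rnn(x, size, share_weights, rng=np.random):
    input_dim = x.shape[1]                       # x: (time, input_dim)
    w_ih_fwd = rng.randn(input_dim, size) * 0.01
    w_ih_bwd = w_ih_fwd if share_weights else rng.randn(input_dim, size) * 0.01
    w_hh_fwd = rng.randn(size, size) * 0.01
    w_hh_bwd = rng.randn(size, size) * 0.01

    def run(proj, w_hh, reverse):
        h, outputs = np.zeros(size), []
        steps = reversed(range(len(proj))) if reverse else range(len(proj))
        for t in steps:
            h = np.tanh(proj[t] + h.dot(w_hh))   # simple RNN cell
            outputs.append(h)
        return np.stack(outputs[::-1] if reverse else outputs)

    forward = run(x.dot(w_ih_fwd), w_hh_fwd, reverse=False)
    backward = run(x.dot(w_ih_bwd), w_hh_bwd, reverse=True)
    return np.concatenate([forward, backward], axis=1)  # (time, 2 * size)


out = bidirectional_simple_rnn(np.random.randn(5, 8), size=4, share_weights=True)
print(out.shape)  # (5, 8)
```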
14 changes: 10 additions & 4 deletions deep_speech_2/model.py
@@ -27,12 +27,17 @@ class DeepSpeech2Model(object):
:param pretrained_model_path: Pretrained model path. If None, will train
from scratch.
:type pretrained_model_path: basestring|None
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward directional RNNs. Notice that
for GRU, weight sharing is not supported.
:type share_rnn_weights: bool
"""

def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, use_gru, pretrained_model_path):
rnn_layer_size, use_gru, pretrained_model_path,
share_rnn_weights):
self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, use_gru)
rnn_layer_size, use_gru, share_rnn_weights)
self._create_parameters(pretrained_model_path)
self._inferer = None
self._loss_inferer = None
@@ -226,7 +231,7 @@ def _create_parameters(self, model_path=None):
gzip.open(model_path))

def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, use_gru):
rnn_layer_size, use_gru, share_rnn_weights):
"""Create data layers and model network."""
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
@@ -244,4 +249,5 @@ def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_layer_size,
use_gru=use_gru)
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
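With the constructor change above, every entry-point script now passes `share_rnn_weights` through to the model. A sketch of the resulting call, mirroring the updated call sites (the vocabulary size and paths are placeholders, and the import assumes the caller sits next to model.py, as the scripts in this PR do):

```python
# Sketch of constructing the model with the new argument; values marked as
# placeholders are illustrative, not taken from the PR.
from model import DeepSpeech2Model

ds2_model = DeepSpeech2Model(
    vocab_size=28,                       # placeholder vocabulary size
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_layer_size=2048,
    use_gru=False,
    pretrained_model_path=None,          # None means training from scratch
    share_rnn_weights=True)              # only takes effect when use_gru=False
```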
12 changes: 10 additions & 2 deletions deep_speech_2/train.py
@@ -37,9 +37,16 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=1024,
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
@@ -176,7 +183,8 @@ def train():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.init_model_path)
pretrained_model_path=args.init_model_path,
share_rnn_weights=args.share_rnn_weights)
ds2_model.train(
train_batch_reader=train_batch_reader,
dev_batch_reader=dev_batch_reader,
12 changes: 10 additions & 2 deletions deep_speech_2/tune.py
@@ -31,9 +31,16 @@
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
@@ -164,7 +171,8 @@ def tune():
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath)
pretrained_model_path=args.model_filepath,
share_rnn_weights=args.share_rnn_weights)

# create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
6 changes: 3 additions & 3 deletions deep_speech_2/utils.py
@@ -10,12 +10,12 @@ def print_arguments(args):
Usage:

.. code-block:: python

parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
args = parser.parse_args()
print_arguments(args)

:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""