This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[v2.0] RNN: use rnn_params #20384

Merged
merged 22 commits on Oct 20, 2021
2 changes: 1 addition & 1 deletion ci/docker/docker-compose.yml
@@ -58,7 +58,7 @@ services:
dockerfile: Dockerfile.build.centos7
target: base
args:
BASE_IMAGE: nvidia/cuda:10.2-cudnn7-devel-centos7
BASE_IMAGE: nvidia/cuda:10.2-cudnn8-devel-centos7
cache_from:
- ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu102:latest
centos7_gpu_cu110:
3 changes: 3 additions & 0 deletions python/mxnet/gluon/parameter.py
@@ -472,6 +472,9 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
ctx = [context.current_context()]
if isinstance(ctx, Context):
ctx = [ctx]
if isinstance(self.init, initializer.RNNFused):
self.init.set_initializer(init if init else default_init)
init = default_init = self.init
if init is None:
init = default_init if self.init is None else self.init
if not shape_is_known(self.shape):
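With this change, `Parameter.initialize` detects a fused `RNNFused` initializer and forwards the user-supplied (or default) initializer into it via `set_initializer` before initialization proceeds. A minimal end-to-end sketch of that path, assuming the MXNet 2.0 Gluon API and illustrative sizes (input_size=10, hidden_size=20, one unidirectional LSTM layer):

```python
import mxnet as mx
from mxnet.gluon import rnn

mx.npx.set_np()

# One unidirectional LSTM layer; it registers a single flat `rnn_param`
# whose init is an RNNFused instance.
net = rnn.LSTM(20, num_layers=1)
# The global Uniform(0.1) is forwarded into RNNFused.set_initializer()
# by the branch added above in Parameter.initialize().
net.initialize(init=mx.init.Uniform(0.1), ctx=mx.cpu())

x = mx.np.random.uniform(size=(5, 3, 10))   # (sequence, batch, input), layout 'TNC'
out = net(x)                                # first call resolves the deferred shape

print(net.rnn_param.data().shape)           # (2560,) -- one fused parameter vector
```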
105 changes: 29 additions & 76 deletions python/mxnet/gluon/rnn/rnn_layer.py
Expand Up @@ -23,7 +23,7 @@

__all__ = ['RNN', 'LSTM', 'GRU']

from ... import np, npx, context
from ... import np, npx, context, initializer
from .. import HybridBlock, tensor_types
from ..parameter import Parameter
from ...util import use_np
@@ -50,11 +50,6 @@ def __init__(self, hidden_size, num_layers, layout,
self._dropout = dropout
self._dir = 2 if bidirectional else 1
self._input_size = input_size
self._i2h_weight_initializer = i2h_weight_initializer
self._h2h_weight_initializer = h2h_weight_initializer
self._i2h_bias_initializer = i2h_bias_initializer
self._h2h_bias_initializer = h2h_bias_initializer
self._h2r_weight_initializer = h2r_weight_initializer
self._lstm_state_clip_min = lstm_state_clip_min
self._lstm_state_clip_max = lstm_state_clip_max
self._lstm_state_clip_nan = lstm_state_clip_nan
@@ -64,48 +64,17 @@ def __init__(self, hidden_size, num_layers, layout,

self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]

ng, ni, nh = self._gates, input_size, hidden_size
if not projection_size:
for i in range(num_layers):
for j in ['l', 'r'][:self._dir]:
self._register_param('{}{}_i2h_weight'.format(j, i),
shape=(ng*nh, ni),
init=i2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_h2h_weight'.format(j, i),
shape=(ng*nh, nh),
init=h2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_i2h_bias'.format(j, i),
shape=(ng*nh,),
init=i2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2h_bias'.format(j, i),
shape=(ng*nh,),
init=h2h_bias_initializer, dtype=dtype)
ni = nh * self._dir
else:
ps = self._projection_size
for i in range(num_layers):
for j in ['l', 'r'][:self._dir]:
self._register_param('{}{}_i2h_weight'.format(j, i),
shape=(ng*nh, ni),
init=i2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_h2h_weight'.format(j, i),
shape=(ng*nh, ps),
init=h2h_weight_initializer, dtype=dtype)
self._register_param('{}{}_i2h_bias'.format(j, i),
shape=(ng*nh,),
init=i2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2h_bias'.format(j, i),
shape=(ng*nh,),
init=h2h_bias_initializer, dtype=dtype)
self._register_param('{}{}_h2r_weight'.format(j, i),
shape=(ps, nh),
init=h2r_weight_initializer, dtype=dtype)
ni = ps * self._dir

def _register_param(self, name, shape, init, dtype):
p = Parameter(name, shape=shape, init=init, allow_deferred_init=True, dtype=dtype)
setattr(self, name, p)
return p
param_initializer = initializer.RNNFused(
mode, num_layers, hidden_size,
bidirectional, projection_size,
i2h_weight_initializer=i2h_weight_initializer,
h2h_weight_initializer=h2h_weight_initializer,
i2h_bias_initializer=i2h_bias_initializer,
h2h_bias_initializer=h2h_bias_initializer,
h2r_weight_initializer=h2r_weight_initializer)

self.rnn_param = Parameter('rnn_param', shape=(-1,), init=param_initializer,
allow_deferred_init=True, dtype=dtype)

def __repr__(self):
s = '{name}({mapping}, {_layout}'
@@ -116,8 +80,7 @@ def __repr__(self):
if self._dir == 2:
s += ', bidirectional'
s += ')'
shape = self.l0_i2h_weight.shape
mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0] // self._gates)
mapping = '{0} -> {1}'.format(self._input_size if self._input_size else None, self._hidden_size)
return s.format(name=self.__class__.__name__,
mapping=mapping,
**self.__dict__)
@@ -196,37 +159,26 @@ def forward(self, inputs, states, sequence_length=None):
def infer_shape(self, inputs, *args):
assert inputs.ndim == 3, \
"Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"
if not self._projection_size:
step = self._hidden_size
else:
step = self._projection_size
ni = inputs.shape[2]
for i in range(self._num_layers):
for j in ['l', 'r'][:self._dir]:
name = '{}{}_i2h_weight'.format(j, i)
getattr(self, name).shape = (self._gates*self._hidden_size, ni)
ni = step * self._dir
self._input_size = inputs.shape[2]
ng, ni, nh = self._gates, inputs.shape[2], self._hidden_size

size = nh * self._dir * ng
size1 = (ni + nh + 2) * size # first layer size
size2 = (nh * self._dir + nh + 2) * size # second layer size
if self._projection_size:
size1 = (ni + self._projection_size + 2) * size # first layer size
size2 = (self._projection_size * self._dir + \
self._projection_size + 2) * size # second layer size
param_size = size1 + (self._num_layers - 1) * size2
if self._projection_size:
param_size += self._projection_size * nh * self._num_layers * self._dir
self.rnn_param.shape = (param_size, )

def _forward_kernel(self, inputs, states, sequence_length):
""" forward using CUDNN or CPU kenrel"""
ctx = inputs.ctx
if self._layout == 'NTC':
inputs = np.swapaxes(inputs, 0, 1)
if self._projection_size is None:
params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
for t in ['weight', 'bias']
for l in range(self._num_layers)
for d in ['l', 'r'][:self._dir]
for g in ['i2h', 'h2h'])
else:
params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
for t in ['weight', 'bias']
for l in range(self._num_layers)
for d in ['l', 'r'][:self._dir]
for g in ['i2h', 'h2h', 'h2r']
if g != 'h2r' or t != 'bias')

params = np.concatenate(params, axis=0)

if self._use_sequence_length:
rnn_args = states + [sequence_length]
@@ -238,7 +190,8 @@ def _forward_kernel(self, inputs, states, sequence_length):
new_args = args.as_in_ctx(ctx)
rnn_args_ctx.append(new_args)

rnn = npx.rnn(inputs, params, *rnn_args_ctx, use_sequence_length=self._use_sequence_length,
rnn = npx.rnn(inputs, self.rnn_param.data().as_in_ctx(ctx), *rnn_args_ctx,
use_sequence_length=self._use_sequence_length,
state_size=self._hidden_size, projection_size=self._projection_size,
num_layers=self._num_layers, bidirectional=self._dir == 2,
p=self._dropout, state_outputs=True, mode=self._mode,
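The rewritten `infer_shape` folds every i2h/h2h weight and bias into one flat parameter whose length follows the `size1`/`size2` formula above. A small worked check for the non-projection case, with illustrative sizes (not taken from the diff):

```python
# Worked check of the flat-parameter length computed in infer_shape
# (non-projection case): unidirectional, single-layer LSTM,
# input_size=10, hidden_size=20.
gates, dirs = 4, 1                      # LSTM has 4 gates; unidirectional
ni, nh, num_layers = 10, 20, 1

size = nh * dirs * gates                # 80
size1 = (ni + nh + 2) * size            # first layer: 2560
size2 = (nh * dirs + nh + 2) * size     # each later layer: 3360
param_size = size1 + (num_layers - 1) * size2

# Same element count as the old per-parameter layout:
# i2h_weight (80, 10) + h2h_weight (80, 20) + i2h_bias (80,) + h2h_bias (80,)
assert param_size == 800 + 1600 + 80 + 80 == 2560
```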
76 changes: 76 additions & 0 deletions python/mxnet/gluon/utils.py
@@ -504,3 +504,79 @@ def _check_block_input_np_ndarrays(inputs):
for i in inputs:
_check_block_input_np_ndarrays(i)
# pylint: enable=no-else-raise


# pylint: disable=too-many-nested-blocks
def split_rnn_params(param, mode, num_layers, input_size, hidden_size, bidirectional=False, projection_size=None):
"""Split rnn layer parameter into weight and bias in different layer.

Parameters
----------
param : ndarray
The parameter of rnn layer.
mode : str
Mode of rnn. Supported modes: rnn_relu, rnn_tanh, lstm, gru
num_layers : int
Number of recurrent layers.
input_size: int
The number of expected features in the input x.
hidden_size: int
The number of features in the hidden state h.
bidirectional: bool, default False
If `True`, becomes a bidirectional RNN.
projection_size: int, default None
The number of features after projection.

Returns
-------
dict of str to ndarray
Mapping from parameter name (e.g. 'l0_i2h_weight') to its slice of `param`,
reshaped to the per-layer shape.
"""
gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
dir = 2 if bidirectional else 1
param_dict = {}
begin = 0
if not projection_size:
for p in ['weight', 'bias']:
for l in range(num_layers):
for d in ['l', 'r'][:dir]:
for g in ['i2h', 'h2h']:
ni = input_size
if l != 0:
ni = hidden_size * dir
if g == 'h2h':
ni = hidden_size
shape0 = gates * hidden_size
if p == 'weight':
cur_len = shape0 * ni
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0, ni)
else:
cur_len = shape0
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0,)
begin += cur_len
else:
for p in ['weight', 'bias']:
for l in range(num_layers):
for d in ['l', 'r'][:dir]:
for g in ['i2h', 'h2h', 'h2r']:
if g != 'h2r' or p != 'bias':
if g == 'h2r':
cur_len = projection_size * hidden_size
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len]. \
reshape(projection_size, hidden_size)
else:
ni = input_size
if l != 0:
ni = projection_size * dir
if g == 'h2h':
ni = projection_size
shape0 = gates * hidden_size
if p == 'weight':
cur_len = shape0 * ni
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0, ni)
else:
cur_len = shape0
param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
param[begin:begin+cur_len].reshape(shape0,)
begin += cur_len
return param_dict
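`split_rnn_params` recovers the familiar per-layer views from the fused vector; keys follow the `'{direction}{layer}_{connection}_{weight|bias}'` pattern used above. A minimal usage sketch (the length 2560 matches the single-layer LSTM example earlier; the data is a stand-in, not a trained parameter):

```python
from mxnet import np
from mxnet.gluon.utils import split_rnn_params

fused = np.arange(2560, dtype='float32')      # stand-in for rnn_param.data()
parts = split_rnn_params(fused, mode='lstm', num_layers=1,
                         input_size=10, hidden_size=20)

print(parts['l0_i2h_weight'].shape)           # (80, 10)
print(parts['l0_h2h_weight'].shape)           # (80, 20)
print(parts['l0_i2h_bias'].shape)             # (80,)
print(parts['l0_h2h_bias'].shape)             # (80,)
```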
119 changes: 119 additions & 0 deletions python/mxnet/initializer.py
@@ -711,3 +711,122 @@ def _init_weight(self, name, arr):
# gate of the 4 LSTM gates, we modify the corresponding values.
num_hidden = int(arr.shape[0] / 4)
arr[num_hidden:2*num_hidden] = self.forget_bias


@register
class RNNFused(Initializer):
"""Initialize RNN fused parameter with bias part initialized to 0.0 and
weight initialized with random values uniformly sampled from a given range.

Parameters
----------
mode : {'gru', 'lstm', 'rnn_relu', 'rnn_tanh'}, required
the type of RNN to compute
num_layers : int (non-negative), required
number of stacked layers
state_size : int (non-negative), required
size of the state for each layer
bidirectional : boolean, optional, default=0
whether to use bidirectional recurrent layers
projection_size : int or None, optional, default='None'
size of project size
scale : float, optional
The bound on the range of the generated random values for weights.
Values are generated from the range [-`scale`, `scale`].
Default scale is 0.07.
"""
def __init__(self, mode, num_layers, state_size, bidirectional=False,
projection_size=None, i2h_weight_initializer=None,
h2h_weight_initializer=None, i2h_bias_initializer=None,
h2h_bias_initializer=None, h2r_weight_initializer=None):
super(RNNFused, self).__init__(mode=mode, num_layers=num_layers,
state_size=state_size,
bidirectional=bidirectional,
projection_size=projection_size,
i2h_weight_initializer=i2h_weight_initializer,
h2h_weight_initializer=h2h_weight_initializer,
i2h_bias_initializer=i2h_bias_initializer,
h2h_bias_initializer=h2h_bias_initializer,
h2r_weight_initializer=h2r_weight_initializer)
self.gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
self.num_layers = num_layers
self.num_hidden = state_size
self.dir = 2 if bidirectional else 1
self.projection_size = projection_size
self._i2h_weight_initializer = i2h_weight_initializer
self._h2h_weight_initializer = h2h_weight_initializer
self._i2h_bias_initializer = i2h_bias_initializer
self._h2h_bias_initializer = h2h_bias_initializer
self._h2r_weight_initializer = h2r_weight_initializer

# pylint: disable=too-many-nested-blocks
def _init_weight(self, name, arr):
arr_len = arr.shape[0]
size = self.num_hidden * self.dir * self.gates
if not self.projection_size:
# second layer size
size2 = (self.num_hidden * self.dir + self.num_hidden + 2) * size
input_size = (arr_len - (self.num_layers - 1) * size2) // \
size - 2 - self.num_hidden
else:
# second layer size
size2 = (self.projection_size * self.dir + self.projection_size + 2) * size
size_projection = self.projection_size * self.num_hidden * self.num_layers * self.dir
input_size = (arr_len - size_projection - (self.num_layers - 1) * size2) // \
size - 2 - self.projection_size
begin = 0
if not self.projection_size:
for param in ['weight', 'bias']:
for layer_num in range(self.num_layers):
for _ in range(self.dir):
for connect in ['i2h', 'h2h']:
num_inputs = input_size
if layer_num != 0:
num_inputs = self.num_hidden * self.dir
if connect == 'h2h':
num_inputs = self.num_hidden
shape0 = self.gates * self.num_hidden
if param == 'weight':
cur_len = shape0 * num_inputs
else:
cur_len = shape0
self._init_util(param, connect, arr[begin:begin+cur_len])
begin += cur_len
else:
for param in ['weight', 'bias']:
for layer_num in range(self.num_layers):
for _ in range(self.dir):
for connect in ['i2h', 'h2h', 'h2r']:
if connect != 'h2r' or param != 'bias':
if connect == 'h2r':
cur_len = self.projection_size * self.num_hidden
else:
num_inputs = input_size
if layer_num != 0:
num_inputs = self.projection_size * self.dir
if connect == 'h2h':
num_inputs = self.projection_size
shape0 = self.gates * self.num_hidden
if param == 'weight':
cur_len = shape0 * num_inputs
else:
cur_len = shape0
self._init_util(param, connect, arr[begin:begin+cur_len])
begin += cur_len

def _init_util(self, param, connect, arr):
name = "_{}_{}_initializer".format(connect, param)
init = getattr(self, name)
create(init)(InitDesc(name, {'__init__': init}), arr)

def set_initializer(self, init):
self._i2h_weight_initializer = \
init if not self._i2h_weight_initializer else 'uniform'
self._h2h_weight_initializer = \
init if not self._h2h_weight_initializer else 'uniform'
self._i2h_bias_initializer = \
init if not self._i2h_bias_initializer else 'zero'
self._h2h_bias_initializer = \
init if not self._h2h_bias_initializer else 'zero'
self._h2r_weight_initializer = \
init if not self._h2r_weight_initializer else 'uniform'
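A hedged sketch of exercising `RNNFused` directly; in the Gluon path, `Parameter.initialize` calls `set_initializer` (see the parameter.py change above) and then runs the initializer over the flat `rnn_param`, so the direct `_init_weight` call here is only illustrative. Sizes are made up for the example (rnn_tanh, one layer, hidden_size=4, input_size=3):

```python
import mxnet as mx

# Sketch only: exercising RNNFused outside the Gluon deferred-init machinery.
fused_init = mx.init.RNNFused('rnn_tanh', num_layers=1, state_size=4)
fused_init.set_initializer(mx.init.Uniform(0.07))   # fallback for blocks left as None

# Flat length for input_size=3, hidden_size=4, one unidirectional layer with 1 gate:
# (3 + 4 + 2) * 4 = 36
arr = mx.np.zeros((36,))
fused_init._init_weight('rnn_param', arr)            # normally invoked via Parameter.initialize
print(arr[:28])                                       # the i2h/h2h weight blocks
```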