
Commit

Add cudnn layernorm lowerbound benchmark (pytorch#53)
* Add cudnn layernorm lowerbound benchmark

* backward=None
wanchaol authored Feb 23, 2019
1 parent 15cf583 commit 2cd03bd
Showing 3 changed files with 40 additions and 1 deletion.
3 changes: 2 additions & 1 deletion rnns/fastrnns/bench.py
@@ -150,7 +150,8 @@ def bench(rnn_runners, group_name, print_json=False, sep=' ', **params):
    rnns = args.rnns or ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_simple',
                         'jit_multilayer', 'py']
    # TODO: Maybe add a separate section for the layernorm/dropout lstms
-    # 'jit_layernorm', 'jit_layernom_decom', 'jit', 'jit_dropout', 'cudnn_dropout'
+    # 'cudnn_layernorm', 'jit_layernorm', 'jit_layernom_decom',
+    # 'jit', 'jit_dropout', 'cudnn_dropout'
    vlrnns = ['vl_cudnn', 'vl_jit', 'vl_py']
    cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
    if args.print_json:
37 changes: 37 additions & 0 deletions rnns/fastrnns/factory.py
@@ -267,6 +267,43 @@ def varlen_lstm_creator(script=False, **kwargs):
backward=simple_backward)


# cudnn_layernorm_lstm: since cuDNN does not have a LayerNorm LSTM, we cannot benchmark
# the lower bound directly. Instead, we benchmark only the forward pass by mimicking the
# computation of a cudnn lstm + seq_len * 3 layernorm computations. This should serve
# as a perf lower bound for the LayerNorm LSTM forward pass (given that LayerNorm itself
# is invariant). The lower bound of the backward pass is hard to get since we lose the
# intermediate results, but we can still optimize the layernorm implementation to make
# the forward lower bound faster.
def layernorm_pytorch_lstm_creator(**kwargs):
    input, hidden, _, module = lstm_inputs(return_module=True, **kwargs)
    batch_size = kwargs['miniBatch']
    hidden_size = kwargs['hiddenSize']
    ln_i = torch.nn.LayerNorm(4 * hidden_size).cuda()
    ln_h = torch.nn.LayerNorm(4 * hidden_size).cuda()
    ln_c = torch.nn.LayerNorm(hidden_size).cuda()
    ln_input1 = torch.randn(batch_size, 4 * hidden_size, device='cuda')

    def forward(input, hidden):
        out, new_hidden = module(input, hidden)
        # plus (seq_len * three LayerNorm cell computations) to mimic the lower bound
        # of a LayerNorm cuDNN LSTM in the forward pass
        seq_len = len(input.unbind(0))
        hy, cy = new_hidden
        for i in range(seq_len):
            ln_i_output = ln_i(ln_input1)
            ln_h_output = ln_h(ln_input1)
            cy = ln_c(cy)

        return out, (hy, cy)

    return ModelDef(
        inputs=[input, hidden],
        params=flatten_list(module.all_weights),
        forward=forward,
        backward_setup=lstm_backward_setup,
        backward=None)


# input: lstm.all_weights format (wih, whh, bih, bhh = lstm.all_weights[layer])
# output: packed_weights with format
# packed_weights[0] is wih with size (layer, 4*hiddenSize, inputSize)
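As a quick sanity check outside the benchmark harness, the new creator can be invoked directly. The sketch below is hypothetical and not part of this commit: it assumes the rnns/ directory is on the Python path, a CUDA device is available, and that lstm_inputs supplies defaults for every size except the miniBatch and hiddenSize keywords visible in the diff above.

# Hypothetical standalone use of the new creator (not part of this commit).
import torch
from fastrnns.factory import layernorm_pytorch_lstm_creator

model = layernorm_pytorch_lstm_creator(miniBatch=64, hiddenSize=512)
out, (hy, cy) = model.forward(*model.inputs)  # backward is None: forward-only lower bound
torch.cuda.synchronize()                      # let the cuDNN + LayerNorm kernels finish
print(out.shape, hy.shape, cy.shape)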
1 change: 1 addition & 0 deletions rnns/fastrnns/runner.py
@@ -45,6 +45,7 @@ def get_rnn_runners(*names):
rnn_runners = {
    'cudnn': RNNRunner('cudnn', pytorch_lstm_creator, DummyContext),
    'cudnn_dropout': RNNRunner('cudnn_dropout', partial(pytorch_lstm_creator, dropout=0.4), DummyContext),
    'cudnn_layernorm': RNNRunner('cudnn_layernorm', layernorm_pytorch_lstm_creator, DummyContext),
    'vl_cudnn': RNNRunner('vl_cudnn', varlen_pytorch_lstm_creator, DummyContext),
    'vl_jit': RNNRunner('vl_jit', partial(varlen_lstm_creator, script=True), DummyContext),
    'vl_py': RNNRunner('vl_py', varlen_lstm_creator, DummyContext),
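With the runner registered, a rough forward-only comparison against the plain cuDNN runner could look like the sketch below. This is hypothetical and not part of the commit: the name/creator fields on RNNRunner are assumed from how the entries above are constructed, and bench.py does the real measurement with warmup and proper parameterization.

# Hypothetical timing loop (not part of this commit).
import torch
from fastrnns.runner import get_rnn_runners

def time_forward_ms(model, iters=100):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        model.forward(*model.inputs)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

for runner in get_rnn_runners('cudnn', 'cudnn_layernorm'):
    model = runner.creator(miniBatch=64, hiddenSize=512)  # other sizes fall back to defaults
    print(runner.name, time_forward_ms(model), 'ms / iter')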
