-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_layernorm_benchmark.py
138 lines (127 loc) · 7.47 KB
/
gen_layernorm_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import subprocess
from nvprof_parser import parse_nvprof_out
import pandas as pd
import re
# (batch, channel) pairs whose cells as_markdown_table renders in bold.
LARGE_BERT_SHAPES = [(batch, channel)
                     for batch in (128 * 32, 128 * 64)
                     for channel in (768, 1024)]

# Benchmark grid: every batch size is combined with every channel size.
TEST_BATCH_L = [128 * factor for factor in (1, 20, 32, 64, 128)]
TEST_CHANNEL_L = [32, 64, 128, 256, 512, 768, 1024]

# Executables used to launch each benchmark script (optionally under nvprof).
NVPROF_EXE = 'nvprof'
PYTHON_EXE = 'python3'

# Values forwarded to the benchmark scripts as command-line arguments.
N_REPEAT = 3
EPS = 1e-5
USE_GPU = 1
DTYPE = 'float32'

# Pattern for the timing line the benchmark scripts write to stdout; each
# group captures a non-negative decimal number.
TIME_R = r'\d+\.?\d*'
LN_OUT_REG = 'Forward: (' + TIME_R + ')us, Backward: (' + TIME_R + ')us'

# Keywords used to pick the fused MXNet LayerNorm kernels out of nvprof output.
MX_FWD_KEYWORD = 'LayerNormFusedForwardKernel'
MX_BWD_DATA_KEYWORD = 'LayerNormFusedBackwardKernel_Data'
MX_BWD_GAMMA_BETA_KEYWORD = [
    'LayerNormFusedBackwardKernel_PartGammaBeta',
    'LayerNormFusedBackwardKernel_GammaBeta',
]

# Keywords used instead of the fused ones when test_speed is called with
# profile_old=True.
MX_OLD_KEYWORD = [
    'op::broadcast::reduce_kernel',
    'binary_broadcast_kernel',
    'mshadow_op3mul',
]
def as_markdown_table(df, highlight_shapes=None):
    """Render a timing DataFrame as a Markdown table.

    Columns of ``df`` are labelled ``B=<column>`` in the header row and each
    index entry is labelled ``**C=<index>**`` in the first cell of its row.
    Every value is formatted with ``{:g}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are batch sizes and whose index is channel sizes.
    highlight_shapes : container of (batch, channel) tuples, optional
        Cells whose ``(column, index)`` pair is in this container are wrapped
        in ``**`` (bold). Defaults to the module-level ``LARGE_BERT_SHAPES``.
        Pass an empty container to disable highlighting.

    Returns
    -------
    str
        The Markdown text; every row, including the last, ends with a newline.
    """
    if highlight_shapes is None:
        highlight_shapes = LARGE_BERT_SHAPES

    def format_cell(batch, channel):
        # Bold the cell only when its (batch, channel) pair is highlighted.
        if (batch, channel) in highlight_shapes:
            template = ' **{:g}** '
        else:
            template = ' {:g} '
        return template.format(df.loc[channel, batch])

    # The leading '| ' + '|' forms the empty top-left header cell; the exact
    # spacing is kept so the generated text stays byte-identical.
    rows = [
        '| ' + '|' + '|'.join(' B={} '.format(batch) for batch in df.columns) + ' |',
        '| --- ' + '|' + '|'.join(' --- ' for _ in df.columns) + ' |',
    ]
    for channel in df.index:
        cells = '|'.join(format_cell(batch, channel) for batch in df.columns)
        rows.append('|**C={}**'.format(channel) + '|' + cells + ' |')
    return ''.join(row + '\n' for row in rows)
def _codebase_config(codebase, profile_old):
    """Return (script, extra_args, fwd_kw, bwd_data_kw, bwd_gamma_beta_kw) for a codebase."""
    if codebase == 'mxnet':
        if profile_old:
            return 'layer_norm_mx.py', [], MX_OLD_KEYWORD, MX_OLD_KEYWORD, MX_OLD_KEYWORD
        return ('layer_norm_mx.py', [],
                MX_FWD_KEYWORD, MX_BWD_DATA_KEYWORD, MX_BWD_GAMMA_BETA_KEYWORD)
    if codebase == 'pytorch':
        # NOTE(review): no kernel keywords are defined for plain PyTorch, so
        # None is passed to fetch_run_time when profile_nv is set -- confirm
        # that parse_nvprof_out's result object accepts keyword=None.
        return 'layer_norm_pytorch.py', [], None, None, None
    if codebase == 'pytorch_apex':
        return ('layer_norm_pytorch.py', ['--apex'],
                'cuApplyLayerNorm', 'cuComputeGradInput',
                ['cuComputePartGradGammaBeta', 'cuComputeGradGammaBeta'])
    raise NotImplementedError('Unsupported codebase: {!r}'.format(codebase))


def _parse_script_timings(ret, run_args):
    """Extract the (forward, backward) microsecond timings printed by the benchmark script."""
    runfile_out = ret.stdout.decode('utf-8')
    # search() rather than match(): the timing line need not be the very first
    # thing the script writes to stdout.
    found = re.search(LN_OUT_REG, runfile_out)
    if found is None:
        stderr_tail = ret.stderr.decode('utf-8', errors='replace')[-2000:]
        raise RuntimeError(
            'Could not find the timing line in the output of {!r} '
            '(exit code {}).\nstdout:\n{}\nstderr (tail):\n{}'.format(
                ' '.join(run_args), ret.returncode, runfile_out, stderr_tail))
    fwd_time, bwd_time = found.groups()
    return float(fwd_time), float(bwd_time)


def _total_kernel_time(nvprof_result, keyword):
    """Sum the per-kernel run times (in us) that nvprof reports for *keyword*."""
    _, run_times, _, _, _ = nvprof_result.fetch_run_time(keyword=keyword, unit='us')
    return sum(run_times)


def test_speed(codebase, test_batch_l, test_channel_l, eps, use_gpu, dtype, profile_nv,
               profile_old=False):
    """Benchmark LayerNorm for every (batch, channel) pair and collect the timings.

    For each combination the matching benchmark script is launched in a
    subprocess (under ``nvprof`` when ``profile_nv`` is true). The timing line
    on the script's stdout gives the "python timer" values; with ``profile_nv``
    the nvprof report on stderr is parsed for per-kernel times as well.

    Parameters
    ----------
    codebase : str
        One of ``'mxnet'``, ``'pytorch'`` or ``'pytorch_apex'``.
    test_batch_l : list
        Batch sizes; they become the columns of the returned tables.
    test_channel_l : list
        Channel sizes; they become the index of the returned tables.
    eps, use_gpu, dtype
        Forwarded to the script as ``--eps``, ``--use_gpu`` and ``--dtype``.
    profile_nv : bool
        Run under nvprof and fill the nvprof tables.
    profile_old : bool
        MXNet only: match the kernels in ``MX_OLD_KEYWORD`` instead of the
        fused LayerNorm kernels.

    Returns
    -------
    tuple of six pandas.DataFrame
        ``(py_fwd, py_bwd, nv_fwd, nv_bwd, nv_bwd_data, nv_bwd_gamma_beta)``,
        all in rounded microseconds. The four nvprof tables stay unfilled
        when ``profile_nv`` is false.

    Raises
    ------
    NotImplementedError
        If ``codebase`` is not supported (raised before any run is started).
    RuntimeError
        If a run's stdout does not contain the expected timing line.
    """
    # The script and kernel keywords depend only on the codebase, so resolve
    # them once instead of on every loop iteration.
    script, extra_args, fwd_keyword, bwd_data_keyword, bwd_gamma_beta_keyword = \
        _codebase_config(codebase, profile_old)

    (py_time_fwd_df, py_time_bwd_df, nv_time_fwd_df, nv_time_bwd_df,
     nv_time_bwd_data_df, nv_time_bwd_gamma_beta_df) = [
        pd.DataFrame(columns=test_batch_l, index=test_channel_l) for _ in range(6)]

    for nbatch in test_batch_l:
        for nchannel in test_channel_l:
            run_args = [PYTHON_EXE, script, '--use_gpu', str(use_gpu), '--nbatch', str(nbatch),
                        '--nchannel', str(nchannel),
                        '--eps', str(eps), '--dtype', dtype,
                        '--nrepeat', str(N_REPEAT)] + extra_args
            if profile_nv:
                run_args = [NVPROF_EXE] + run_args
            ret = subprocess.run(run_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

            fwd_time, bwd_time = _parse_script_timings(ret, run_args)
            py_time_fwd_df.loc[nchannel, nbatch] = round(fwd_time)
            py_time_bwd_df.loc[nchannel, nbatch] = round(bwd_time)

            if not profile_nv:
                continue

            # The nvprof report is read from the subprocess's stderr.
            nvprof_result = parse_nvprof_out(ret.stderr.decode('utf-8'))
            nv_fwd_time = _total_kernel_time(nvprof_result, fwd_keyword)
            nv_bwd_data_time = _total_kernel_time(nvprof_result, bwd_data_keyword)
            nv_bwd_gamma_beta_time = _total_kernel_time(nvprof_result, bwd_gamma_beta_keyword)
            # Total backward time is the data-gradient plus gamma/beta-gradient kernels.
            nv_bwd_time = nv_bwd_data_time + nv_bwd_gamma_beta_time
            print('{}, B={}, C={}, fwd={}, bwd={}, bwd_data={}, bwd_gamma_beta={}'.format(
                codebase, nbatch, nchannel,
                round(nv_fwd_time),
                round(nv_bwd_time),
                round(nv_bwd_data_time),
                round(nv_bwd_gamma_beta_time)))
            nv_time_fwd_df.loc[nchannel, nbatch] = round(nv_fwd_time)
            nv_time_bwd_df.loc[nchannel, nbatch] = round(nv_bwd_time)
            nv_time_bwd_data_df.loc[nchannel, nbatch] = round(nv_bwd_data_time)
            nv_time_bwd_gamma_beta_df.loc[nchannel, nbatch] = round(nv_bwd_gamma_beta_time)

    return (py_time_fwd_df, py_time_bwd_df, nv_time_fwd_df, nv_time_bwd_df,
            nv_time_bwd_data_df, nv_time_bwd_gamma_beta_df)
# Benchmark PyTorch Apex under nvprof over the full batch/channel grid, then
# print each resulting table as Markdown.
(apex_py_fwd_time, apex_py_bwd_time, apex_nv_fwd_time, apex_nv_bwd_time,
 apex_nv_bwd_data_time, apex_nv_bwd_gamma_beta_time) = test_speed(
    'pytorch_apex', TEST_BATCH_L, TEST_CHANNEL_L, EPS, USE_GPU, DTYPE, profile_nv=True)
print('PyTorch Apex')
for _heading, _table in (
        ('Forward (nvprof timer)\n', apex_nv_fwd_time),
        ('Backward (nvprof timer)\n', apex_nv_bwd_time),
        ('Backward Data (nvprof timer)\n', apex_nv_bwd_data_time),
        ('Backward Gamma & Beta (nvprof timer)\n', apex_nv_bwd_gamma_beta_time),
        ('Forward (python timer)\n', apex_py_fwd_time),
        ('Backward (python timer)\n', apex_py_bwd_time)):
    print(_heading)
    print(as_markdown_table(_table))
# Benchmark MXNet under nvprof over the full batch/channel grid, then print
# each resulting table as Markdown.
(mx_py_fwd_time, mx_py_bwd_time, mx_nv_fwd_time, mx_nv_bwd_time,
 mx_nv_bwd_data_time, mx_nv_bwd_gamma_beta_time) = test_speed(
    'mxnet', TEST_BATCH_L, TEST_CHANNEL_L, EPS, USE_GPU, DTYPE, profile_nv=True)
print('MXNet')
for _heading, _table in (
        ('Forward (nvprof timer)\n', mx_nv_fwd_time),
        ('Backward (nvprof timer)\n', mx_nv_bwd_time),
        ('Backward Data (nvprof timer)\n', mx_nv_bwd_data_time),
        ('Backward Gamma & Beta (nvprof timer)\n', mx_nv_bwd_gamma_beta_time),
        ('Forward (python timer)\n', mx_py_fwd_time),
        ('Backward (python timer)\n', mx_py_bwd_time)):
    print(_heading)
    print(as_markdown_table(_table))