-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenCL]optimize conv3x3 when group==1 #5618
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and | |
limitations under the License. */ | ||
|
||
#include <cl_common.h> | ||
|
||
__kernel void conv2d_3x3_opt(__private const int item_ch, | ||
__private const int item_w, | ||
__private const int item_h, | ||
|
@@ -36,22 +35,21 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, | |
const int item_w_id = get_global_id(1); | ||
const int item_h_id = get_global_id(2); | ||
|
||
// out_width_id_per_blk and out_batch_id | ||
int out_batch_id = item_h_id / in_h; | ||
int out_w_base_id = item_ch_id * out_w; | ||
// out_width_id_per_blk | ||
int out_w_base_id = mul24(item_ch_id, out_w); | ||
int out_w_id0 = item_w_id; | ||
int out_w_id1 = out_w_id0 + item_w; | ||
int out_w_id2 = out_w_id1 + item_w; | ||
int out_w_id3 = out_w_id2 + item_w; | ||
int out_w_id4 = out_w_id3 + item_w; | ||
|
||
// in_width_id_per_blk and in_height_id_per_batch | ||
int in_h_id = (item_h_id % out_h) * stride - pad; | ||
int in_w_id0 = item_w_id * stride - pad; | ||
int in_w_id1 = in_w_id0 + item_w * stride; | ||
int in_w_id2 = in_w_id1 + item_w * stride; | ||
int in_w_id3 = in_w_id2 + item_w * stride; | ||
int in_w_id4 = in_w_id3 + item_w * stride; | ||
int in_h_id = mad24((item_h_id % out_h), stride, (-pad)); | ||
int in_w_id0 = mad24(item_w_id, stride, (-pad)); | ||
int in_w_id1 = mad24(item_w, stride, in_w_id0); | ||
int in_w_id2 = mad24(item_w, stride, in_w_id1); | ||
int in_w_id3 = mad24(item_w, stride, in_w_id2); | ||
int in_w_id4 = mad24(item_w, stride, in_w_id3); | ||
|
||
#ifdef BIASE_CH | ||
|
||
|
@@ -99,81 +97,56 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, | |
#endif | ||
|
||
CL_DTYPE4 filter[4] = {0.0f}; | ||
CL_DTYPE4 filter_trans[4] = {0.0f}; | ||
CL_DTYPE4 input[5] = {0.0f}; | ||
|
||
int filter_h_val0 = item_ch_id * 4 * 3; | ||
int filter_h_val1 = filter_h_val0 + 3; | ||
int filter_h_val2 = filter_h_val1 + 3; | ||
int filter_h_val3 = filter_h_val2 + 3; | ||
|
||
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { | ||
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; | ||
for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { | ||
int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0; | ||
|
||
const int in_w_base_id = mul24(ch, in_w); | ||
|
||
int filter_w_val = ch * 3; | ||
int filter_w_val = ch << 2; | ||
int filter_h_val = mul24(item_ch_id, 9); | ||
|
||
for (int h = 0; h < 3; h++) { | ||
int in_h_val = select(out_batch_id * in_h + in_h_id + h, | ||
-1, | ||
(out_batch_id * in_h + in_h_id + h < 0 || | ||
out_batch_id * in_h + in_h_id + h >= in_h)); | ||
|
||
int in_h_val = in_h_id + h; | ||
|
||
for (int w = 0; w < 3; w++) { | ||
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, | ||
-1, | ||
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); | ||
(in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上,按位与 操作比 或 操作,有多少性能提升,可以单独测下只修改此处的性能变化,如果有提升,select 都可以按此方式修改下。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 嗯嗯,我直接用模型再测一下,上次测得模型都是未tune的,这次测试把tune之后的性能变化也补上。本来是修改成int in_w_val0 = ((in_w_base_id + in_w_id0 + w + 1) & -(in_w_id0 + w >= 0 & in_w_id0 + w < in_w)) - 1这种的,发现如果不修改filter实现方式性能有提升,修改后加上这个修改性能反而下降。 |
||
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, | ||
-1, | ||
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); | ||
(in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); | ||
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, | ||
-1, | ||
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); | ||
(in_w_id2 + w < 0 | in_w_id2 + w >= in_w)); | ||
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, | ||
-1, | ||
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); | ||
(in_w_id3 + w < 0 | in_w_id3 + w >= in_w)); | ||
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, | ||
-1, | ||
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); | ||
(in_w_id4 + w < 0 | in_w_id4 + w >= in_w)); | ||
|
||
filter[0] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 | ||
(int2)(filter_w_val, filter_h_val)); | ||
filter[1] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 | ||
(int2)(filter_w_val + 1, filter_h_val)); | ||
filter[2] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 | ||
(int2)(filter_w_val + 2, filter_h_val)); | ||
filter[3] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 | ||
|
||
filter_trans[0] = (CL_DTYPE4)(filter[0].x, | ||
filter[1].x, | ||
filter[2].x, | ||
filter[3].x); // in_ch:0,out_ch:0-3 | ||
filter_trans[1] = (CL_DTYPE4)(filter[0].y, | ||
filter[1].y, | ||
filter[2].y, | ||
filter[3].y); // in_ch:1,out_ch:0-3 | ||
filter_trans[2] = (CL_DTYPE4)(filter[0].z, | ||
filter[1].z, | ||
filter[2].z, | ||
filter[3].z); // in_ch:2,out_ch:0-3 | ||
filter_trans[3] = (CL_DTYPE4)(filter[0].w, | ||
filter[1].w, | ||
filter[2].w, | ||
filter[3].w); // in_ch:3,out_ch:0-3 | ||
(int2)(filter_w_val + 3, filter_h_val++)); | ||
|
||
input[0] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); | ||
|
@@ -186,37 +159,36 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, | |
input[4] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); | ||
|
||
output[0] = mad(input[0].x, filter_trans[0], output[0]); | ||
output[1] = mad(input[1].x, filter_trans[0], output[1]); | ||
output[2] = mad(input[2].x, filter_trans[0], output[2]); | ||
output[3] = mad(input[3].x, filter_trans[0], output[3]); | ||
output[4] = mad(input[4].x, filter_trans[0], output[4]); | ||
output[0] = mad(input[0].x, filter[0], output[0]); | ||
output[1] = mad(input[1].x, filter[0], output[1]); | ||
output[2] = mad(input[2].x, filter[0], output[2]); | ||
output[3] = mad(input[3].x, filter[0], output[3]); | ||
output[4] = mad(input[4].x, filter[0], output[4]); | ||
|
||
if (ch_surplus < 3) { | ||
output[0] = mad(input[0].y, filter_trans[1], output[0]); | ||
output[1] = mad(input[1].y, filter_trans[1], output[1]); | ||
output[2] = mad(input[2].y, filter_trans[1], output[2]); | ||
output[3] = mad(input[3].y, filter_trans[1], output[3]); | ||
output[4] = mad(input[4].y, filter_trans[1], output[4]); | ||
output[0] = mad(input[0].y, filter[1], output[0]); | ||
output[1] = mad(input[1].y, filter[1], output[1]); | ||
output[2] = mad(input[2].y, filter[1], output[2]); | ||
output[3] = mad(input[3].y, filter[1], output[3]); | ||
output[4] = mad(input[4].y, filter[1], output[4]); | ||
} | ||
if (ch_surplus < 2) { | ||
output[0] = mad(input[0].z, filter_trans[2], output[0]); | ||
output[1] = mad(input[1].z, filter_trans[2], output[1]); | ||
output[2] = mad(input[2].z, filter_trans[2], output[2]); | ||
output[3] = mad(input[3].z, filter_trans[2], output[3]); | ||
output[4] = mad(input[4].z, filter_trans[2], output[4]); | ||
output[0] = mad(input[0].z, filter[2], output[0]); | ||
output[1] = mad(input[1].z, filter[2], output[1]); | ||
output[2] = mad(input[2].z, filter[2], output[2]); | ||
output[3] = mad(input[3].z, filter[2], output[3]); | ||
output[4] = mad(input[4].z, filter[2], output[4]); | ||
} | ||
if (ch_surplus < 1) { | ||
output[0] = mad(input[0].w, filter_trans[3], output[0]); | ||
output[1] = mad(input[1].w, filter_trans[3], output[1]); | ||
output[2] = mad(input[2].w, filter_trans[3], output[2]); | ||
output[3] = mad(input[3].w, filter_trans[3], output[3]); | ||
output[4] = mad(input[4].w, filter_trans[3], output[4]); | ||
output[0] = mad(input[0].w, filter[3], output[0]); | ||
output[1] = mad(input[1].w, filter[3], output[1]); | ||
output[2] = mad(input[2].w, filter[3], output[2]); | ||
output[3] = mad(input[3].w, filter[3], output[3]); | ||
output[4] = mad(input[4].w, filter[3], output[4]); | ||
} | ||
} | ||
} | ||
} | ||
|
||
CL_DTYPE4 alpha[5]; | ||
#ifdef PRELU_CH //{ | ||
alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); | ||
|
@@ -324,27 +296,27 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, | |
__private const int out_w, | ||
__private const int out_h, | ||
__read_only image2d_t prelu_alpha) { | ||
// item_id | ||
// item_id | ||
const int item_ch_id = get_global_id(0); | ||
const int item_w_id = get_global_id(1); | ||
const int item_h_id = get_global_id(2); | ||
|
||
// out_width_id_per_blk and out_batch_id | ||
// out_width_id_per_blk | ||
int out_batch_id = item_h_id / in_h; | ||
int out_w_base_id = item_ch_id * out_w; | ||
int out_w_base_id = mul24(item_ch_id, out_w); | ||
int out_w_id0 = item_w_id; | ||
int out_w_id1 = out_w_id0 + item_w; | ||
int out_w_id2 = out_w_id1 + item_w; | ||
int out_w_id3 = out_w_id2 + item_w; | ||
int out_w_id4 = out_w_id3 + item_w; | ||
|
||
// in_width_id_per_blk and in_height_id_per_batch | ||
int in_h_id = (item_h_id % out_h) * stride - pad; | ||
int in_w_id0 = item_w_id * stride - pad; | ||
int in_w_id1 = in_w_id0 + item_w * stride; | ||
int in_w_id2 = in_w_id1 + item_w * stride; | ||
int in_w_id3 = in_w_id2 + item_w * stride; | ||
int in_w_id4 = in_w_id3 + item_w * stride; | ||
int in_h_id = mad24((item_h_id % out_h), stride, (-pad)); | ||
int in_w_id0 = mad24(item_w_id, stride, (-pad)); | ||
int in_w_id1 = mad24(item_w, stride, in_w_id0); | ||
int in_w_id2 = mad24(item_w, stride, in_w_id1); | ||
int in_w_id3 = mad24(item_w, stride, in_w_id2); | ||
int in_w_id4 = mad24(item_w, stride, in_w_id3); | ||
|
||
#ifdef BIASE_CH | ||
|
||
|
@@ -392,82 +364,60 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, | |
#endif | ||
|
||
CL_DTYPE4 filter[4] = {0.0f}; | ||
CL_DTYPE4 filter_trans[4] = {0.0f}; | ||
CL_DTYPE4 input[5] = {0.0f}; | ||
|
||
int filter_h_val0 = item_ch_id * 4 * 3; | ||
int filter_h_val1 = filter_h_val0 + 3; | ||
int filter_h_val2 = filter_h_val1 + 3; | ||
int filter_h_val3 = filter_h_val2 + 3; | ||
|
||
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { | ||
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; | ||
for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { | ||
int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0; | ||
|
||
const int in_w_base_id = mul24(ch, in_w); | ||
|
||
int filter_w_val = ch * 3; | ||
int filter_w_val = ch << 2; | ||
int filter_h_val = mul24(item_ch_id, 9); | ||
|
||
for (int h = 0; h < 3; h++) { | ||
int in_h_val = select( | ||
out_batch_id * in_h + in_h_id + h, | ||
mad24(out_batch_id, in_h, (in_h_id + h)), | ||
-1, | ||
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || | ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); | ||
|
||
(mad24(out_batch_id, in_h, (in_h_id + h)) < mul24(out_batch_id, in_h) || | ||
mad24(out_batch_id, in_h, (in_h_id + h)) >= mul24((out_batch_id + 1), in_h))); | ||
for (int w = 0; w < 3; w++) { | ||
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, | ||
-1, | ||
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); | ||
(in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); | ||
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, | ||
-1, | ||
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); | ||
(in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); | ||
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, | ||
-1, | ||
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); | ||
(in_w_id2 + w < 0 | in_w_id2 + w >= in_w)); | ||
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, | ||
-1, | ||
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); | ||
(in_w_id3 + w < 0 | in_w_id3 + w >= in_w)); | ||
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, | ||
-1, | ||
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); | ||
(in_w_id4 + w < 0 | in_w_id4 + w >= in_w)); | ||
|
||
filter[0] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 | ||
(int2)(filter_w_val, filter_h_val)); | ||
filter[1] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 | ||
(int2)(filter_w_val + 1, filter_h_val)); | ||
filter[2] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 | ||
(int2)(filter_w_val + 2, filter_h_val)); | ||
filter[3] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, | ||
filter_image, | ||
SAMPLER, | ||
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 | ||
|
||
filter_trans[0] = (CL_DTYPE4)(filter[0].x, | ||
filter[1].x, | ||
filter[2].x, | ||
filter[3].x); // in_ch:0,out_ch:0-3 | ||
filter_trans[1] = (CL_DTYPE4)(filter[0].y, | ||
filter[1].y, | ||
filter[2].y, | ||
filter[3].y); // in_ch:1,out_ch:0-3 | ||
filter_trans[2] = (CL_DTYPE4)(filter[0].z, | ||
filter[1].z, | ||
filter[2].z, | ||
filter[3].z); // in_ch:2,out_ch:0-3 | ||
filter_trans[3] = (CL_DTYPE4)(filter[0].w, | ||
filter[1].w, | ||
filter[2].w, | ||
filter[3].w); // in_ch:3,out_ch:0-3 | ||
(int2)(filter_w_val + 3, filter_h_val++)); | ||
|
||
input[0] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); | ||
|
@@ -480,32 +430,32 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, | |
input[4] = READ_IMG_TYPE( | ||
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); | ||
|
||
output[0] = mad(input[0].x, filter_trans[0], output[0]); | ||
output[1] = mad(input[1].x, filter_trans[0], output[1]); | ||
output[2] = mad(input[2].x, filter_trans[0], output[2]); | ||
output[3] = mad(input[3].x, filter_trans[0], output[3]); | ||
output[4] = mad(input[4].x, filter_trans[0], output[4]); | ||
output[0] = mad(input[0].x, filter[0], output[0]); | ||
output[1] = mad(input[1].x, filter[0], output[1]); | ||
output[2] = mad(input[2].x, filter[0], output[2]); | ||
output[3] = mad(input[3].x, filter[0], output[3]); | ||
output[4] = mad(input[4].x, filter[0], output[4]); | ||
|
||
if (ch_surplus < 3) { | ||
output[0] = mad(input[0].y, filter_trans[1], output[0]); | ||
output[1] = mad(input[1].y, filter_trans[1], output[1]); | ||
output[2] = mad(input[2].y, filter_trans[1], output[2]); | ||
output[3] = mad(input[3].y, filter_trans[1], output[3]); | ||
output[4] = mad(input[4].y, filter_trans[1], output[4]); | ||
output[0] = mad(input[0].y, filter[1], output[0]); | ||
output[1] = mad(input[1].y, filter[1], output[1]); | ||
output[2] = mad(input[2].y, filter[1], output[2]); | ||
output[3] = mad(input[3].y, filter[1], output[3]); | ||
output[4] = mad(input[4].y, filter[1], output[4]); | ||
} | ||
if (ch_surplus < 2) { | ||
output[0] = mad(input[0].z, filter_trans[2], output[0]); | ||
output[1] = mad(input[1].z, filter_trans[2], output[1]); | ||
output[2] = mad(input[2].z, filter_trans[2], output[2]); | ||
output[3] = mad(input[3].z, filter_trans[2], output[3]); | ||
output[4] = mad(input[4].z, filter_trans[2], output[4]); | ||
output[0] = mad(input[0].z, filter[2], output[0]); | ||
output[1] = mad(input[1].z, filter[2], output[1]); | ||
output[2] = mad(input[2].z, filter[2], output[2]); | ||
output[3] = mad(input[3].z, filter[2], output[3]); | ||
output[4] = mad(input[4].z, filter[2], output[4]); | ||
} | ||
if (ch_surplus < 1) { | ||
output[0] = mad(input[0].w, filter_trans[3], output[0]); | ||
output[1] = mad(input[1].w, filter_trans[3], output[1]); | ||
output[2] = mad(input[2].w, filter_trans[3], output[2]); | ||
output[3] = mad(input[3].w, filter_trans[3], output[3]); | ||
output[4] = mad(input[4].w, filter_trans[3], output[4]); | ||
output[0] = mad(input[0].w, filter[3], output[0]); | ||
output[1] = mad(input[1].w, filter[3], output[1]); | ||
output[2] = mad(input[2].w, filter[3], output[2]); | ||
output[3] = mad(input[3].w, filter[3], output[3]); | ||
output[4] = mad(input[4].w, filter[3], output[4]); | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
直接写乘加实现,与显式使用
mad24
,单纯修改这类有多少性能提升,测试过这个吗?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个我单测时测过,没有特别明显的变化,模型没有对应单独测,我再测一下。mad24手册上是建议对性考虑时优先使用