Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenCL]optimize conv3x3 when group==1 #5618

Merged
merged 4 commits into from
Mar 29, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 89 additions & 139 deletions lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

__kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
Expand All @@ -36,22 +35,21 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);

// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
// out_width_id_per_blk
int out_w_base_id = mul24(item_ch_id, out_w);
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;

// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
int in_h_id = mad24((item_h_id % out_h), stride, (-pad));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

直接写乘加实现,与显式使用mad24,单纯修改这类有多少性能提升,测试过这个吗?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个我单测时测过,没有特别明显的变化,模型没有对应单独测,我再测一下。mad24手册上是建议对性考虑时优先使用

int in_w_id0 = mad24(item_w_id, stride, (-pad));
int in_w_id1 = mad24(item_w, stride, in_w_id0);
int in_w_id2 = mad24(item_w, stride, in_w_id1);
int in_w_id3 = mad24(item_w, stride, in_w_id2);
int in_w_id4 = mad24(item_w, stride, in_w_id3);

#ifdef BIASE_CH

Expand Down Expand Up @@ -99,81 +97,56 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#endif

CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};

int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;

for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) {
int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0;

const int in_w_base_id = mul24(ch, in_w);

int filter_w_val = ch * 3;
int filter_w_val = ch << 2;
int filter_h_val = mul24(item_ch_id, 9);

for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h));

int in_h_val = in_h_id + h;

for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
(in_w_id0 + w < 0 | in_w_id0 + w >= in_w));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

同上,按位与 操作比 或 操作,有多少性能提升,可以单独测下只修改此处的性能变化,如果有提升,select 都可以按此方式修改下。

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

嗯嗯,我直接用模型再测一下,上次测得模型都是未tune的,这次测试把tune之后的性能变化也补上。本来是修改成int in_w_val0 = ((in_w_base_id + in_w_id0 + w + 1) & -(in_w_id0 + w >= 0 & in_w_id0 + w < in_w)) - 1这种的,发现如果不修改filter实现方式性能有提升,修改后加上这个修改性能反而下降。

int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
(in_w_id1 + w < 0 | in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
(in_w_id2 + w < 0 | in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
(in_w_id3 + w < 0 | in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
(in_w_id4 + w < 0 | in_w_id4 + w >= in_w));

filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
(int2)(filter_w_val, filter_h_val));
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
(int2)(filter_w_val + 1, filter_h_val));
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
(int2)(filter_w_val + 2, filter_h_val));
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3

filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
(int2)(filter_w_val + 3, filter_h_val++));

input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val));
Expand All @@ -186,37 +159,36 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val));

output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
output[0] = mad(input[0].x, filter[0], output[0]);
output[1] = mad(input[1].x, filter[0], output[1]);
output[2] = mad(input[2].x, filter[0], output[2]);
output[3] = mad(input[3].x, filter[0], output[3]);
output[4] = mad(input[4].x, filter[0], output[4]);

if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
output[0] = mad(input[0].y, filter[1], output[0]);
output[1] = mad(input[1].y, filter[1], output[1]);
output[2] = mad(input[2].y, filter[1], output[2]);
output[3] = mad(input[3].y, filter[1], output[3]);
output[4] = mad(input[4].y, filter[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
output[0] = mad(input[0].z, filter[2], output[0]);
output[1] = mad(input[1].z, filter[2], output[1]);
output[2] = mad(input[2].z, filter[2], output[2]);
output[3] = mad(input[3].z, filter[2], output[3]);
output[4] = mad(input[4].z, filter[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
output[0] = mad(input[0].w, filter[3], output[0]);
output[1] = mad(input[1].w, filter[3], output[1]);
output[2] = mad(input[2].w, filter[3], output[2]);
output[3] = mad(input[3].w, filter[3], output[3]);
output[4] = mad(input[4].w, filter[3], output[4]);
}
}
}
}

CL_DTYPE4 alpha[5];
#ifdef PRELU_CH //{
alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0));
Expand Down Expand Up @@ -324,27 +296,27 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int out_w,
__private const int out_h,
__read_only image2d_t prelu_alpha) {
// item_id
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);

// out_width_id_per_blk and out_batch_id
// out_width_id_per_blk
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_base_id = mul24(item_ch_id, out_w);
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;

// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
int in_h_id = mad24((item_h_id % out_h), stride, (-pad));
int in_w_id0 = mad24(item_w_id, stride, (-pad));
int in_w_id1 = mad24(item_w, stride, in_w_id0);
int in_w_id2 = mad24(item_w, stride, in_w_id1);
int in_w_id3 = mad24(item_w, stride, in_w_id2);
int in_w_id4 = mad24(item_w, stride, in_w_id3);

#ifdef BIASE_CH

Expand Down Expand Up @@ -392,82 +364,60 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
#endif

CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};

int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;

for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) {
int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1)<< 2) - in_ch : 0;

const int in_w_base_id = mul24(ch, in_w);

int filter_w_val = ch * 3;
int filter_w_val = ch << 2;
int filter_h_val = mul24(item_ch_id, 9);

for (int h = 0; h < 3; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
mad24(out_batch_id, in_h, (in_h_id + h)),
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));

(mad24(out_batch_id, in_h, (in_h_id + h)) < mul24(out_batch_id, in_h) ||
mad24(out_batch_id, in_h, (in_h_id + h)) >= mul24((out_batch_id + 1), in_h)));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
(in_w_id0 + w < 0 | in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
(in_w_id1 + w < 0 | in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
(in_w_id2 + w < 0 | in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
(in_w_id3 + w < 0 | in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
(in_w_id4 + w < 0 | in_w_id4 + w >= in_w));

filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
(int2)(filter_w_val, filter_h_val));
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
(int2)(filter_w_val + 1, filter_h_val));
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
(int2)(filter_w_val + 2, filter_h_val));
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
SAMPLER,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3

filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
(int2)(filter_w_val + 3, filter_h_val++));

input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val));
Expand All @@ -480,32 +430,32 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val));

output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
output[0] = mad(input[0].x, filter[0], output[0]);
output[1] = mad(input[1].x, filter[0], output[1]);
output[2] = mad(input[2].x, filter[0], output[2]);
output[3] = mad(input[3].x, filter[0], output[3]);
output[4] = mad(input[4].x, filter[0], output[4]);

if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
output[0] = mad(input[0].y, filter[1], output[0]);
output[1] = mad(input[1].y, filter[1], output[1]);
output[2] = mad(input[2].y, filter[1], output[2]);
output[3] = mad(input[3].y, filter[1], output[3]);
output[4] = mad(input[4].y, filter[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
output[0] = mad(input[0].z, filter[2], output[0]);
output[1] = mad(input[1].z, filter[2], output[1]);
output[2] = mad(input[2].z, filter[2], output[2]);
output[3] = mad(input[3].z, filter[2], output[3]);
output[4] = mad(input[4].z, filter[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
output[0] = mad(input[0].w, filter[3], output[0]);
output[1] = mad(input[1].w, filter[3], output[1]);
output[2] = mad(input[2].w, filter[3], output[2]);
output[3] = mad(input[3].w, filter[3], output[3]);
output[4] = mad(input[4].w, filter[3], output[4]);
}
}
}
Expand Down
Loading