diff --git a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl index ab575ba9b38..cc45f66859b 100644 --- a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl @@ -62,7 +62,8 @@ __kernel void depthwise_conv2d(const int numel, // num of elements v += bias_data[c]; } #ifdef RELU - output_data[index] = activation(v); + CL_DTYPE alpha; + output_data[index] = activation(v, alpha); #else output_data[index] = v; #endif diff --git a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl index bb6faea629c..05566798ad7 100644 --- a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl @@ -37,7 +37,8 @@ __kernel void elementwise_add(__global const CL_DTYPE* x_data, for (int n = 0; n < num; ++n) { // n: [0, h*w) *dout_ptr = *din_ptr + diny_data; #ifdef RELU - *dout_ptr = activation(*dout_ptr); + CL_DTYPE alpha; + *dout_ptr = activation(*dout_ptr, alpha); #endif ++dout_ptr; ++din_ptr; diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index 080ce2b4574..b48b83e788a 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -54,7 +54,8 @@ void fc_gemm_naive(__global const CL_DTYPE* a, } #ifdef RELU - c[row * N + col] = activation(c0); + CL_DTYPE alpha; + c[row * N + col] = activation(c0, alpha); #else c[row * N + col] = c0; #endif @@ -91,7 +92,8 @@ void gemm_batch_naive(__global const CL_DTYPE* a, c0 += a0 * b0; } - cur_c[row * N + col] = activation(c0); + CL_DTYPE alpha; + cur_c[row * N + col] = activation(c0, alpha); } @@ -235,7 +237,8 @@ void fc_gemv_naive(__global const CL_DTYPE* a, } #ifdef RELU - c[col] = activation(c0); + CL_DTYPE alpha; + c[col] = activation(c0, alpha); #else c[col] = c0; #endif @@ -254,6 +257,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, const int M, const int N, const int K) { const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N + half alpha; if (col + 3 < N) { half4 c0 = 0.0f; if (bias) { @@ -310,11 +314,11 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } else { switch (col % 4) { case 3: - c[col + 2] = activation(c0.z); + c[col + 2] = activation(c0.z, alpha); case 2: - c[col + 1] = activation(c0.y); + c[col + 1] = activation(c0.y, alpha); case 1: - c[col] = activation(c0.x); + c[col] = activation(c0.x, alpha); } } #else @@ -341,7 +345,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, c0 += a0 * b0; } #ifdef RELU - c[col + col_offset] = activation(c0); + c[col + col_offset] = activation(c0, alpha); #else c[col + col_offset] = c0; #endif diff --git a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl index b07dc8132f4..fcf74685924 100644 --- a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl @@ -16,7 +16,8 @@ limitations under the License. */ __kernel void relu(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) { const int index = get_global_id(0); + CL_DTYPE alpha; if (index < count) { - out_data[index] = activation(x_data[index]); + out_data[index] = activation(x_data[index], alpha); } } diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index a8013e4ec4c..6998051e1e9 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -90,12 +90,7 @@ __constant sampler_t SAMPLER = ///////////////////////////////// // activation / activation_type4 ///////////////////////////////// -inline CL_DTYPE activation(CL_DTYPE in -#ifdef PRELU - , - CL_DTYPE prelu_alpha -#endif - ) { +inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) { CL_DTYPE output = in; #ifdef PRELU output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0); @@ -138,12 +133,7 @@ inline CL_DTYPE activation(CL_DTYPE in return output; } -inline CL_DTYPE4 activation_type4(CL_DTYPE4 in -#ifdef PRELU - , - CL_DTYPE4 prelu_alpha -#endif - ) { +inline CL_DTYPE4 activation_type4(CL_DTYPE4 in, CL_DTYPE4 prelu_alpha) { CL_DTYPE4 output = in; #ifdef PRELU output = select(prelu_alpha * in, in, isgreaterequal(in, (CL_DTYPE4)0)); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 57880dfec43..67b0ccdd50e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -21,7 +21,8 @@ __kernel void conv2d_1x1_opt( __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, - __private const int old_w) { + __private const int old_w, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -251,10 +252,33 @@ __kernel void conv2d_1x1_opt( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif - output0 = activation_type4(output0); - output1 = activation_type4(output1); - output2 = activation_type4(output2); - output3 = activation_type4(output3); +CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#endif + output0 = activation_type4(output0, alpha0); + output1 = activation_type4(output1, alpha1); + output2 = activation_type4(output2, alpha2); + output3 = activation_type4(output3, alpha3); #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); @@ -301,7 +325,8 @@ __kernel void conv2d_1x1_simple( __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, - __private const int old_w) { + __private const int old_w, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -421,10 +446,33 @@ __kernel void conv2d_1x1_simple( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif - output0 = activation_type4(output0); - output1 = activation_type4(output1); - output2 = activation_type4(output2); - output3 = activation_type4(output3); +CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#endif + output0 = activation_type4(output0, alpha0); + output1 = activation_type4(output1, alpha1); + output2 = activation_type4(output2, alpha2); + output3 = activation_type4(output3, alpha3); #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 4e16ead836f..65c119a23c4 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -34,7 +34,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int filter_width, __private const int filter_height, __private const int group, - __private const int input_tensor_c) { + __private const int input_tensor_c, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -251,7 +252,22 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, output.w = (i == 3) ? output.w + tmp_out : output.w; } } - output = activation_type4(output); + +CL_DTYPE4 alpha0; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + //} +#endif + output = activation_type4(output, alpha0); #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 02a0c778103..ebd0e021b99 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -29,7 +29,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); @@ -216,11 +217,56 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -276,7 +322,8 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); @@ -464,11 +511,56 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index 587d488e9ab..d96dc8cc11b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -32,7 +32,8 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -172,7 +173,21 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif - output = activation_type4(output); +CL_DTYPE4 alpha0; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + //} +#endif + output = activation_type4(output, alpha0); #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 927d56d5ac2..b041eb73ac1 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -30,7 +30,8 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 5; const int filter_h = 5; @@ -222,11 +223,56 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -281,7 +327,8 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 5; const int filter_h = 5; @@ -477,11 +524,56 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 67ffea51539..2ffd988f180 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -18,7 +18,8 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -123,7 +124,21 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif - output = activation_type4(output); +CL_DTYPE4 alpha0; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + //} +#endif + output = activation_type4(output, alpha0); #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index cc64137fcd5..497234428a8 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -30,7 +30,8 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 7; const int filter_h = 7; @@ -222,11 +223,56 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -281,7 +327,8 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 7; const int filter_h = 7; @@ -477,11 +524,56 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, } } - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); +CL_DTYPE4 alpha[5]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl index 962fa3a7182..d60440aaea5 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl @@ -33,7 +33,8 @@ __kernel void conv2d_common(__private const int global_size_dim0, __private const int padding_width, __private const int padding_height, __private const int dilation_width, - __private const int dilation_height) { + __private const int dilation_height, + __read_only image2d_t prelu_alpha) { const int out_channel_block_idx = get_global_id(0); const int out_width_block_idx = get_global_id(1); const int output_bh_idx = get_global_id(2); @@ -86,7 +87,6 @@ __kernel void conv2d_common(__private const int global_size_dim0, CL_DTYPE4 out3 = out0; #endif - int in_width0 = mad24(out_width_block_idx, stride_width << 2, -padding_width); int in_width1 = in_width0 + stride_width; int in_width2 = in_width0 + stride_width * 2; @@ -152,16 +152,57 @@ __kernel void conv2d_common(__private const int global_size_dim0, } } } - out0 = activation_type4(out0); - out1 = activation_type4(out1); - out2 = activation_type4(out2); - out3 = activation_type4(out3); +CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, output_bh_idx)); + if (out_w_id1 < output_width) { + alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, output_bh_idx)); + } + if (out_w_id2 < output_width) { + alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, output_bh_idx)); + } + if (out_w_id3 < output_width) { + alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, output_bh_idx)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#endif + out0 = activation_type4(out0, alpha0); + out1 = activation_type4(out1, alpha1); + out2 = activation_type4(out2, alpha2); + out3 = activation_type4(out3, alpha3); #ifdef SCALE_ACTIVATION - out0 = fuse_scale(out0, 1.f, 0.f, 0.f); - out1 = fuse_scale(out1, 1.f, 0.f, 0.f); - out2 = fuse_scale(out2, 1.f, 0.f, 0.f); - out3 = fuse_scale(out3, 1.f, 0.f, 0.f); + out0 = fuse_scale(out0, 1.f, 0.f, 0.f); + out1 = fuse_scale(out1, 1.f, 0.f, 0.f); + out2 = fuse_scale(out2, 1.f, 0.f, 0.f); + out3 = fuse_scale(out3, 1.f, 0.f, 0.f); #endif const int out_x_base = mul24(out_channel_block_idx, output_width); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 1d8a64793fd..717db4784b6 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -33,7 +33,8 @@ __kernel void depth_conv2d_common(__private const int global_size_dim0, // (out_ __private const int output_width, __private const int output_height, __private const int filter_width, - __private const int filter_height) { + __private const int filter_height, + __read_only image2d_t prelu_alpha) { const int out_c_blk = get_global_id(0); // [0, (C+3)/4) const int out_w_blk = get_global_id(1); // [0, (W+3)/4) @@ -107,10 +108,51 @@ __kernel void depth_conv2d_common(__private const int global_size_dim0, // (out_ } } - out0 = activation_type4(out0); - out1 = activation_type4(out1); - out2 = activation_type4(out2); - out3 = activation_type4(out3); +CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, output_bh_idx)); + if (out_w_id1 < output_width) { + alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, output_bh_idx)); + } + if (out_w_id2 < output_width) { + alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, output_bh_idx)); + } + if (out_w_id3 < output_width) { + alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, output_bh_idx)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#endif + out0 = activation_type4(out0, alpha0); + out1 = activation_type4(out1, alpha1); + out2 = activation_type4(out2, alpha2); + out3 = activation_type4(out3, alpha3); #ifdef SCALE_ACTIVATION out0 = fuse_scale(out0, 1.f, 0.f, 0.f); @@ -156,7 +198,8 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int output_width, __private const int output_height, __private const int filter_width, - __private const int filter_height) { + __private const int filter_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -210,7 +253,21 @@ __kernel void depth_conv2d(__private const int global_size_dim0, } } - output = activation_type4(output); +CL_DTYPE4 alpha0; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + //} +#else //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + //} +#endif + output = activation_type4(output, alpha0); #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 729298a4b6b..c5ef38038e3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -30,7 +30,8 @@ __kernel void depth_conv2d_3x3( __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -205,7 +206,21 @@ __kernel void depth_conv2d_3x3( output += inputs[i] * filters[i]; } - output = activation_type4(output); +CL_DTYPE4 alpha0; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + //} +#else //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + //} +#endif + output = activation_type4(output, alpha0); #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); @@ -252,7 +267,8 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int in_w, /* of one block */ __private const int in_h, /* of one block */ __private const int ou_w, - __private const int ou_h) { + __private const int ou_h, + __read_only image2d_t prelu_alpha) { const int ou_ch_blk_id = get_global_id(0); const int ou_w_blk_id = get_global_id(1); @@ -363,8 +379,29 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[0] = mad(inputs[10], filters[8], output[0]); output[1] = mad(inputs[11], filters[8], output[1]); - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); +CL_DTYPE4 alpha[2]; +#ifdef PRELU_CH //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_ch_blk_id, 0)); + alpha[1] = alpha[0]; + //} +#elif defined(PRELU_ELE) //{ + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + alpha[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x + 1, ou_nh_id)); + } + //} +#else //{ + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; + alpha[0].z = alpha[0].x; + alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; + //} +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl index 0dc287ed847..a7cc062b664 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl @@ -33,7 +33,8 @@ __kernel void elementwise_add(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords); #endif CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords); - CL_DTYPE4 output = activation_type4(in + biase); + CL_DTYPE4 alpha; + CL_DTYPE4 output = activation_type4(in + biase, alpha); WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); } diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl index 3bcc2159705..c31131f15ae 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl @@ -26,7 +26,8 @@ __kernel void elementwise_sub(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords); CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords); - CL_DTYPE4 output = activation_type4(in - biase); + CL_DTYPE4 alpha; + CL_DTYPE4 output = activation_type4(in - biase, alpha); WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); } diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 5d8a1fece3a..20c0b978cec 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -56,6 +56,7 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { act_types.push_back("leaky_relu"); act_types.push_back("hard_swish"); act_types.push_back("hard_sigmoid"); + act_types.push_back("prelu"); } if (!has_int8 && has_cuda) { act_types.push_back("leaky_relu"); @@ -64,10 +65,17 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { act_types.push_back("relu"); act_types.push_back("relu6"); } + bool has_alpha = false; for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { + if (act_type == "prelu") { + has_alpha = true; + } else { + has_alpha = false; + } for (auto has_bias : {true, false}) { - fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias); + fusion::ConvActivationFuser fuser( + conv_type, act_type, has_bias, has_alpha); fuser(graph.get()); } } diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc index 413941d301a..49c5029ff9c 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.cc +++ b/lite/core/mir/fusion/conv_activation_fuser.cc @@ -28,9 +28,13 @@ void ConvActivationFuser::BuildPattern() { auto* filter = VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput(); PMNode* bias = nullptr; + PMNode* alpha = nullptr; if (has_bias_) { bias = VarNode("bias")->assert_is_op_input(conv_type_, "Bias")->AsInput(); } + if (has_alpha_) { + alpha = VarNode("alpha")->assert_is_op_input(act_type_, "Alpha")->AsInput(); + } auto* conv2d = OpNode("conv2d", conv_type_)->AsIntermediate(); auto* act = OpNode("act", act_type_)->AsIntermediate(); @@ -49,6 +53,9 @@ void ConvActivationFuser::BuildPattern() { if (has_bias_) { *bias >> *conv2d; } + if (has_alpha_) { + *alpha >> *act; + } } void ConvActivationFuser::InsertNewNode(SSAGraph* graph, @@ -67,6 +74,9 @@ void ConvActivationFuser::InsertNewNode(SSAGraph* graph, if (has_bias_) { IR_NODE_LINK_TO(matched.at("bias"), new_op_node); } + if (has_alpha_) { + IR_NODE_LINK_TO(matched.at("alpha"), new_op_node); + } IR_NODE_LINK_TO(new_op_node, matched.at("output")); } @@ -97,6 +107,10 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { float offset = act_op_desc.GetAttr("offset"); op_desc.SetAttr("slope", slope); op_desc.SetAttr("offset", offset); + } else if (act_type_ == "prelu") { + auto prelu_mode = act_op_desc.GetAttr("mode"); + op_desc.SetAttr("prelu_mode", prelu_mode); + op_desc.SetInput("Prelu_alpha", {matched.at("alpha")->arg()->name}); } return op_desc; } diff --git a/lite/core/mir/fusion/conv_activation_fuser.h b/lite/core/mir/fusion/conv_activation_fuser.h index d352a32f9f8..04951b4c2cb 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.h +++ b/lite/core/mir/fusion/conv_activation_fuser.h @@ -27,10 +27,12 @@ class ConvActivationFuser : public FuseBase { public: explicit ConvActivationFuser(const std::string& conv_type, const std::string& act_type, - bool has_bias) { + bool has_bias, + bool has_alpha) { conv_type_ = conv_type; act_type_ = act_type; has_bias_ = has_bias; + has_alpha_ = has_alpha; } void BuildPattern() override; @@ -41,6 +43,7 @@ class ConvActivationFuser : public FuseBase { std::string conv_type_; std::string act_type_; bool has_bias_; + bool has_alpha_; }; } // namespace fusion diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ad80a8763f6..03dfd5df4dd 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -199,6 +199,7 @@ typedef paddle::lite::kernels::arm::ConvCompute(new Tensor); kernel_func_name_ = "prelu_channel"; auto& out_dims = act_param_->Out->dims(); - width_ = out_dims[3]; + if (out_dims.size() == 4) { + width_ = out_dims[3]; + CLImageConverterFolder alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + act_param_->Prelu_alpha->dims()); + tensor_hold_alpha_image_->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); - CLImageConverterFolder alpha_converter; - const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( - act_param_->Prelu_alpha->dims()); - tensor_hold_alpha_image_->Resize( - {1, alpha_image_dims[0], alpha_image_dims[1], 4}); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); + auto* alpha_cpu_data = + act_param_->Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage(alpha_cpu_data, + alpha_image_data, + act_param_->Prelu_alpha->dims()); - auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); - auto* alpha_cpu_data = act_param_->Prelu_alpha->mutable_data(); - alpha_converter.NCHWToImage(alpha_cpu_data, - alpha_image_data, - act_param_->Prelu_alpha->dims()); + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + } else if (out_dims.size() == 2) { + width_ = 1; + CLImageConverterDefault alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + act_param_->Prelu_alpha->dims()); + tensor_hold_alpha_image_->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); - MUTABLE_DATA_GPU(alpha_gpu_image_, - alpha_image_dims[0], - alpha_image_dims[1], - alpha_image_data); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); + auto* alpha_cpu_data = + act_param_->Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage(alpha_cpu_data, + alpha_image_data, + act_param_->Prelu_alpha->dims()); + + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + } else { + LOG(FATAL) << "unsupport dims.size(): " << out_dims.size(); + } } else { alpha_gpu_image_ = std::unique_ptr(new Tensor); tensor_hold_alpha_image_ = std::unique_ptr(new Tensor); kernel_func_name_ = "prelu_element"; auto& in_dim = act_param_->X->dims(); - height_ = in_dim[2]; + if (in_dim.size() > 3) { + height_ = in_dim[2]; + } else { + height_ = 1; + } scale_ = act_param_->Leaky_relu_alpha; - CLImageConverterFolder alpha_converter; + CLImageConverterDefault alpha_converter; const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( act_param_->Prelu_alpha->dims()); tensor_hold_alpha_image_->Resize( diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index da56a971cc8..17e1941bae8 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -301,6 +301,14 @@ void ConvImageCompute::PrepareForRun() { << static_cast(conv_param_->activation_param.active_type) << " conv_param_->activation_param.has_active:" << conv_param_->activation_param.has_active; + // alpha_image_p_ init + alpha_gpu_image_ = std::unique_ptr(new Tensor); + std::unique_ptr tensor_hold_alpha_image = + std::unique_ptr(new Tensor); + tensor_hold_alpha_image->Resize({1, 1, 1, 4}); + auto* alpha_image_data = DATA_GPU(tensor_hold_alpha_image); + MUTABLE_DATA_GPU(alpha_gpu_image_, 1, 1, alpha_image_data); + alpha_image_p_ = DATA_GPU(alpha_gpu_image_); if (conv_param_->activation_param.has_active) { if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu) { @@ -333,6 +341,34 @@ void ConvImageCompute::PrepareForRun() { std::to_string(conv_param_->activation_param.hard_sigmoid_offset); build_options_single += " -DHARD_SIGMOID -DHARD_SIGMOID_SLOPE=" + slope + "f" + " -DHARD_SIGMOID_OFFSET=" + offset + "f"; + } else if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kPRelu) { + std::string prelu_mode = conv_param_->activation_param.Prelu_mode; + build_options_single += " -DPRELU"; + if (prelu_mode == "channel") { + build_options_single += " -DPRELU_CH"; + } else if (prelu_mode == "element") { + build_options_single += " -DPRELU_ELE"; + } else { + build_options_single += " -DPRELU_ALL"; + } + CLImageConverterFolder alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + conv_param_->activation_param.Prelu_alpha->dims()); + tensor_hold_alpha_image->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image); + auto* alpha_cpu_data = + conv_param_->activation_param.Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage( + alpha_cpu_data, + alpha_image_data, + conv_param_->activation_param.Prelu_alpha->dims()); + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + alpha_image_p_ = DATA_GPU(alpha_gpu_image_); } else { LOG(FATAL) << "Unsupported activation type:" << static_cast(conv_param_->activation_param.active_type); @@ -678,6 +714,8 @@ void ConvImageCompute::Conv2d1x1opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(16, default_w_blk_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d3x3() { @@ -724,6 +762,8 @@ void ConvImageCompute::Conv2d3x3() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(20, input_tensor_c_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(21, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d3x3opt() { @@ -759,6 +799,8 @@ void ConvImageCompute::Conv2d3x3opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d5x5() { @@ -793,6 +835,8 @@ void ConvImageCompute::Conv2d5x5() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(14, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d5x5opt() { @@ -828,6 +872,8 @@ void ConvImageCompute::Conv2d5x5opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d7x7() { @@ -862,6 +908,8 @@ void ConvImageCompute::Conv2d7x7() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(13, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d7x7opt() { @@ -897,6 +945,8 @@ void ConvImageCompute::Conv2d7x7opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d3x3s1() { @@ -930,6 +980,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(14, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d3x3() { @@ -966,6 +1018,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d() { @@ -1008,6 +1062,8 @@ void ConvImageCompute::DepthwiseConv2d() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(18, filter_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2dCommon() { @@ -1052,6 +1108,8 @@ void ConvImageCompute::Conv2dCommon() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(19, dilation_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Run() { @@ -1163,6 +1221,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1182,6 +1241,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1202,6 +1262,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1221,6 +1282,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 0e66a9701cd..f96887e1f11 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -85,6 +85,7 @@ class ConvImageCompute : public KernelLite filter_gpu_image_{nullptr}; std::unique_ptr bias_gpu_image_{nullptr}; + std::unique_ptr alpha_gpu_image_{nullptr}; std::unique_ptr tensor_hold_filter_image_{nullptr}; std::unique_ptr tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ @@ -98,6 +99,7 @@ class ConvImageCompute : public KernelLite("slope"); param_.activation_param.hard_sigmoid_offset = op_desc.GetAttr("offset"); + } else if (act_type == "prelu") { + param_.activation_param.active_type = lite_api::ActivationType::kPRelu; + param_.activation_param.Prelu_mode = + op_desc.GetAttr("prelu_mode"); + auto prelu_alpha_name = op_desc.Input("Prelu_alpha").front(); + auto prelu_alpha_var = scope->FindVar(prelu_alpha_name); + param_.activation_param.Prelu_alpha = + const_cast(&(prelu_alpha_var->Get())); } else { LOG(FATAL) << "The fused conv only supports fuse with relu, leaky " "relu, hard_swish, while the given activation type is "