diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_transpose_kernel.cl index f9e3654e5aa..de6abd039c3 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_transpose_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_transpose_kernel.cl @@ -28,6 +28,8 @@ __kernel void conv2d_transpose( __private const int2 align_shape, __private const int2 padding_shape, __private const int2 kernel_shape, + __private const int2 dilation_shape, + __private const int2 kernel_prev_shape, __private const int kernel_size, __private const int input_c_blks) { const int out_c_blk_idx = get_global_id(0); @@ -67,7 +69,7 @@ __kernel void conv2d_transpose( CL_DTYPE4 weights0, weights1, weights2, weights3; #ifndef IS_DEPTHWISE for (int ic = 0; ic < input_c_blks; ic++) { - int kernel_y_base = mul24(ic, kernel_size); + int kernel_y_base = mul24(ic, kernel_prev_shape.x * kernel_prev_shape.y); int in_idx = mul24(ic, input_shape.x); kernel_x_0 = out_c_blk_idx << 2; kernel_x_1 = kernel_x_0 + 1; @@ -85,23 +87,35 @@ __kernel void conv2d_transpose( int in_width0 = kernel_start_x; for (int k_x = valid_kernel_width; k_x >= 0; k_x -= stride_shape.x) { #ifndef IS_DEPTHWISE - kernel_y = mad24(k_y, - kernel_shape.x, - k_x + kernel_y_base); // (k_y * k_w + k_x) + k_y_base -#endif -#ifndef IS_DEPTHWISE - weights0 = READ_IMG_TYPE( - CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_0, kernel_y)); - weights1 = READ_IMG_TYPE( - CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_1, kernel_y)); - weights2 = READ_IMG_TYPE( - CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_2, kernel_y)); - weights3 = READ_IMG_TYPE( - CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_3, kernel_y)); + if (k_x % dilation_shape.x == 0 && k_y % dilation_shape.y == 0) { + kernel_y = mad24(k_y / dilation_shape.y, + kernel_prev_shape.x, + k_x / dilation_shape.x + kernel_y_base); + weights0 = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_0, kernel_y)); + weights1 = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_1, kernel_y)); + weights2 = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_2, kernel_y)); + weights3 = READ_IMG_TYPE( + CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x_3, kernel_y)); + } else { + weights0 = (CL_DTYPE4)(0.0f); + weights1 = (CL_DTYPE4)(0.0f); + weights2 = (CL_DTYPE4)(0.0f); + weights3 = (CL_DTYPE4)(0.0f); + } #else - int kernel_x = mad24(out_c_blk_idx, kernel_shape.x, k_x); - weights0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(kernel_x, k_y)); + if (k_x % dilation_shape.x == 0 && k_y % dilation_shape.y == 0) { + int kernel_x = + mad24(out_c_blk_idx, kernel_prev_shape.x, k_x / dilation_shape.x); + weights0 = READ_IMG_TYPE(CL_DTYPE_CHAR, + filter, + SAMPLER, + (int2)(kernel_x, k_y / dilation_shape.y)); + } else { + weights0 = (CL_DTYPE4)(0.0f); + } #endif int in_width_value0 = in_width0; in_width_value0 = diff --git a/lite/kernels/opencl/conv_transpose_image_compute.cc b/lite/kernels/opencl/conv_transpose_image_compute.cc index 6d386892262..85013a9945c 100644 --- a/lite/kernels/opencl/conv_transpose_image_compute.cc +++ b/lite/kernels/opencl/conv_transpose_image_compute.cc @@ -64,6 +64,8 @@ void ConvTransposeImageCompute::PrepareForRun() { /********************************************* * Upload filter, bias to opencl device *********************************************/ + filter_tensor_h_ = dilation_h_ * (filter_tensor_h_ - 1) + 1; + filter_tensor_w_ = dilation_w_ * (filter_tensor_w_ - 1) + 1; auto* filter_cpu = conv_param_->filter->mutable_data(); filter_gpu_image_ = std::unique_ptr(new Tensor); @@ -279,7 +281,10 @@ void ConvTransposeImageCompute::SetArgs() { cl_int2 output_wh = {output_tensor_w_, output_tensor_h_}; cl_int2 filter_wh = {filter_tensor_w_, filter_tensor_h_}; cl_int2 stride_wh = {stride_w_, stride_h_}; - + cl_int2 dilation_wh = {dilation_w_, dilation_h_}; + cl_int2 filter_prev_wh = { + static_cast(conv_param_->filter->dims()[3]), + static_cast(conv_param_->filter->dims()[2])}; auto kernel = &kernel_; uint32_t idx = 0; @@ -308,6 +313,10 @@ void ConvTransposeImageCompute::SetArgs() { CL_CHECK_FATAL(status); kernel->setArg(idx++, filter_wh); CL_CHECK_FATAL(status); + kernel->setArg(idx++, dilation_wh); + CL_CHECK_FATAL(status); + kernel->setArg(idx++, filter_prev_wh); + CL_CHECK_FATAL(status); kernel->setArg(idx++, static_cast(filter_tensor_w_ * filter_tensor_h_)); CL_CHECK_FATAL(status); diff --git a/lite/tests/unittest_py/op/test_conv2d_transpose_op.py b/lite/tests/unittest_py/op/test_conv2d_transpose_op.py index 1342bcb71fa..ca28adeceee 100644 --- a/lite/tests/unittest_py/op/test_conv2d_transpose_op.py +++ b/lite/tests/unittest_py/op/test_conv2d_transpose_op.py @@ -67,20 +67,20 @@ def __init__(self, *args, **kwargs): PrecisionType.FP16, DataLayoutType.NCHW, thread=[1, 4]) - # opencl_valid_places = [ - # Place(TargetType.OpenCL, PrecisionType.FP16, - # DataLayoutType.ImageDefault), Place( - # TargetType.OpenCL, PrecisionType.FP16, - # DataLayoutType.ImageFolder), - # Place(TargetType.OpenCL, PrecisionType.FP32, DataLayoutType.NCHW), - # Place(TargetType.OpenCL, PrecisionType.Any, - # DataLayoutType.ImageDefault), Place( - # TargetType.OpenCL, PrecisionType.Any, - # DataLayoutType.ImageFolder), - # Place(TargetType.OpenCL, PrecisionType.Any, DataLayoutType.NCHW), - # Place(TargetType.Host, PrecisionType.FP32) - # ] - # self.enable_testing_on_place(places=opencl_valid_places) + opencl_valid_places = [ + Place(TargetType.OpenCL, PrecisionType.FP16, + DataLayoutType.ImageDefault), Place( + TargetType.OpenCL, PrecisionType.FP16, + DataLayoutType.ImageFolder), + Place(TargetType.OpenCL, PrecisionType.FP32, DataLayoutType.NCHW), + Place(TargetType.OpenCL, PrecisionType.Any, + DataLayoutType.ImageDefault), Place( + TargetType.OpenCL, PrecisionType.Any, + DataLayoutType.ImageFolder), + Place(TargetType.OpenCL, PrecisionType.Any, DataLayoutType.NCHW), + Place(TargetType.Host, PrecisionType.FP32) + ] + self.enable_testing_on_place(places=opencl_valid_places) def is_program_valid(self, program_config: ProgramConfig, @@ -89,10 +89,10 @@ def is_program_valid(self, def sample_program_configs(self, draw): input_n = draw(st.integers(min_value=1, max_value=4)) - input_c = draw(st.integers(min_value=1, max_value=128)) - input_h = draw(st.integers(min_value=1, max_value=128)) - input_w = draw(st.integers(min_value=1, max_value=128)) - filter_m = draw(st.integers(min_value=1, max_value=16)) + input_c = draw(st.integers(min_value=1, max_value=64)) + input_h = draw(st.integers(min_value=1, max_value=64)) + input_w = draw(st.integers(min_value=1, max_value=64)) + filter_m = draw(st.integers(min_value=1, max_value=64)) filter_c = input_c filter_h = draw(st.integers(min_value=1, max_value=7)) filter_w = draw(st.integers(min_value=1, max_value=7)) @@ -253,13 +253,25 @@ def generate_bias(*args, **kwargs): return program_config def sample_predictor_configs(self): - return self.get_predictor_configs(), ["conv2d_transpose"], (1e-5, 1e-5) + atol, rtol = 1e-5, 1e-5 + target_str = self.get_target() + if target_str == "OpenCL": + atol, rtol = 1e-4, 1e-4 + return self.get_predictor_configs(), ["conv2d_transpose"], (atol, rtol) def add_ignore_pass_case(self): - pass + def teller1(program_config, predictor_config): + groups = program_config.ops[0].attrs["groups"] + if predictor_config.target() == TargetType.OpenCL and groups > 1: + return True + + self.add_ignore_check_case( + teller1, IgnoreReasons.PADDLELITE_NOT_SUPPORT, + "Lite does not support this op in a specific case on opencl. We need to fix it as soon as possible." + ) def test(self, *args, **kwargs): - self.run_and_statis(quant=False, max_examples=300) + self.run_and_statis(quant=False, max_examples=100) if __name__ == "__main__": diff --git a/lite/tests/unittest_py/op/test_depthwise_conv2d_transpose_op.py b/lite/tests/unittest_py/op/test_depthwise_conv2d_transpose_op.py index 11e3fd87379..f146f4381fe 100644 --- a/lite/tests/unittest_py/op/test_depthwise_conv2d_transpose_op.py +++ b/lite/tests/unittest_py/op/test_depthwise_conv2d_transpose_op.py @@ -63,20 +63,20 @@ def __init__(self, *args, **kwargs): PrecisionType.FP16, DataLayoutType.NCHW, thread=[1, 4]) - # opencl_places = [ - # Place(TargetType.OpenCL, PrecisionType.FP16, - # DataLayoutType.ImageDefault), Place( - # TargetType.OpenCL, PrecisionType.FP16, - # DataLayoutType.ImageFolder), - # Place(TargetType.OpenCL, PrecisionType.FP32, DataLayoutType.NCHW), - # Place(TargetType.OpenCL, PrecisionType.Any, - # DataLayoutType.ImageDefault), Place( - # TargetType.OpenCL, PrecisionType.Any, - # DataLayoutType.ImageFolder), - # Place(TargetType.OpenCL, PrecisionType.Any, DataLayoutType.NCHW), - # Place(TargetType.Host, PrecisionType.FP32) - # ] - # self.enable_testing_on_place(places=opencl_places) + opencl_places = [ + Place(TargetType.OpenCL, PrecisionType.FP16, + DataLayoutType.ImageDefault), Place( + TargetType.OpenCL, PrecisionType.FP16, + DataLayoutType.ImageFolder), + Place(TargetType.OpenCL, PrecisionType.FP32, DataLayoutType.NCHW), + Place(TargetType.OpenCL, PrecisionType.Any, + DataLayoutType.ImageDefault), Place( + TargetType.OpenCL, PrecisionType.Any, + DataLayoutType.ImageFolder), + Place(TargetType.OpenCL, PrecisionType.Any, DataLayoutType.NCHW), + Place(TargetType.Host, PrecisionType.FP32) + ] + self.enable_testing_on_place(places=opencl_places) def is_program_valid(self, program_config: ProgramConfig, @@ -247,7 +247,7 @@ def add_ignore_pass_case(self): pass def test(self, *args, **kwargs): - self.run_and_statis(quant=False, max_examples=300) + self.run_and_statis(quant=False, max_examples=100) if __name__ == "__main__":