diff --git a/lite/core/mir/elimination/ssd_boxes_calc_offline_pass.cc b/lite/core/mir/elimination/ssd_boxes_calc_offline_pass.cc index c71a1cce5ba..78d0f364780 100644 --- a/lite/core/mir/elimination/ssd_boxes_calc_offline_pass.cc +++ b/lite/core/mir/elimination/ssd_boxes_calc_offline_pass.cc @@ -469,4 +469,4 @@ void SSDBoxesCalcOfflinePass::ComputeConcat( REGISTER_MIR_PASS(ssd_boxes_calc_offline_pass, paddle::lite::mir::SSDBoxesCalcOfflinePass) - .BindTargets({TARGET(kRKNPU), TARGET(kNPU)}); + .BindTargets({TARGET(kRKNPU), TARGET(kNPU), TARGET(kOpenCL)}); diff --git a/lite/kernels/opencl/box_coder_image_compute.cc b/lite/kernels/opencl/box_coder_image_compute.cc index 1fafb2f6f6b..01202b0c55e 100644 --- a/lite/kernels/opencl/box_coder_image_compute.cc +++ b/lite/kernels/opencl/box_coder_image_compute.cc @@ -48,6 +48,44 @@ class BoxCoderComputeImage : public KernelLitecode_type << " doesn't support"; } + + // H2D: prior_box, prior_box_var + CLImageConverterNormal converter; + priorbox_gpu_image_ = std::unique_ptr(new Tensor); + priorboxvar_gpu_image_ = std::unique_ptr(new Tensor); + auto priorbox_cpu_image = std::unique_ptr(new Tensor); + auto priorboxvar_cpu_image = std::unique_ptr(new Tensor); + + const auto* priorbox_cpu = boxcoder_param_->prior_box->data(); + const auto& priorbox_dims = boxcoder_param_->prior_box->dims(); + auto image_shape = InitImageDimInfoWith(priorbox_dims); + priorbox_cpu_image->Resize( + {1, image_shape["width"], image_shape["height"], 4}); + auto* priorbox_image_data = MUTABLE_DATA_CPU(priorbox_cpu_image); + converter.NCHWToImage( + const_cast(priorbox_cpu), priorbox_image_data, priorbox_dims); + MUTABLE_DATA_GPU(priorbox_gpu_image_, + image_shape["width"], + image_shape["height"], + priorbox_image_data); + + const auto* priorboxvar_cpu = boxcoder_param_->prior_box_var->data(); + const auto& priorboxvar_dims = boxcoder_param_->prior_box_var->dims(); + image_shape = InitImageDimInfoWith(priorboxvar_dims); + priorboxvar_cpu_image->Resize( + {1, 
image_shape["width"], image_shape["height"], 4}); + auto* priorboxvar_image_data = MUTABLE_DATA_CPU(priorboxvar_cpu_image); + converter.NCHWToImage(const_cast(priorboxvar_cpu), + priorboxvar_image_data, + priorboxvar_dims); + MUTABLE_DATA_GPU(priorboxvar_gpu_image_, + image_shape["width"], + image_shape["height"], + priorboxvar_image_data); + + priorbox_image_ = DATA_GPU(priorbox_gpu_image_); + priorboxvar_image_ = DATA_GPU(priorboxvar_gpu_image_); + CHECK(context.cl_context() != nullptr); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; context.cl_context()->AddKernel(kernel_func_name_, @@ -65,18 +103,9 @@ class BoxCoderComputeImage : public KernelLiteproposals->mutable_data( image_shape["width"], image_shape["height"]); -#ifdef LITE_WITH_LOG - VLOG(4) << "boxcoder input shape: "; - -#endif - const auto* input_priorbox = boxcoder_param_->prior_box; - const auto* input_priorboxvar = boxcoder_param_->prior_box_var; const auto* input_targetbox = boxcoder_param_->target_box; const auto& code_type = boxcoder_param_->code_type; if (code_type == "decode_center_size") { - auto* prior_box_image = input_priorbox->data(); - auto* prior_box_var_image = - input_priorboxvar->data(); auto* target_box_image = input_targetbox->data(); int new_dims[4] = {1, 1, 1, 1}; @@ -99,8 +128,13 @@ class BoxCoderComputeImage : public KernelLiteproposals->target()); - VLOG(4) << "output shape: " << out_dims[0] << ", " << out_dims[1] << ", " - << out_dims[2] << ", " << out_dims[3]; + VLOG(4) << "input[PriorBox] shape: " + << boxcoder_param_->prior_box->dims(); + VLOG(4) << "input[PriorBoxVar] shape: " + << boxcoder_param_->prior_box_var->dims(); + VLOG(4) << "input[TargetBox] shape: " + << boxcoder_param_->target_box->dims(); + VLOG(4) << "output[OutputBox] shape: " << out_dims; VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " << image_shape["height"]; VLOG(4) << "out_C = " << out_C; @@ -109,9 +143,9 @@ class BoxCoderComputeImage : public KernelLite 
priorbox_gpu_image_{nullptr}; + std::unique_ptr priorboxvar_gpu_image_{nullptr}; + const cl::Image2D* priorbox_image_{nullptr}; + const cl::Image2D* priorboxvar_image_{nullptr}; }; } // namespace opencl @@ -165,13 +198,9 @@ typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image; REGISTER_LITE_KERNEL( box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault) .BindInput("PriorBox", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))}) .BindInput("PriorBoxVar", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))}) .BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index ed36dae0ef3..30946ccec1f 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -1346,9 +1346,11 @@ void ConvImageCompute::Run() { (this->*impl_)(); auto& context = ctx_->As(); + /* status_ = context.cl_context()->RunKernel( kernel_, global_work_size_, local_work_size_, &event_); - /* + */ + status_ = EnqueueNDRangeKernel(context, kernel_, cl::NullRange, @@ -1356,7 +1358,6 @@ void ConvImageCompute::Run() { local_work_size_, nullptr, event_); - */ CL_CHECK_FATAL(status_); }