Skip to content

Commit

Permalink
[OpenCL] Fix conv PrepareForRun to reduce first-frame time (#8576)
Browse files: browse the repository at this point in the history
* fix conv PrepareForRun test=develop

* fix conv bias flag bug test=develop
  • Loading branch information
sprouteer authored Mar 12, 2022
1 parent 9c744e8 commit a3b1bc7
Showing 1 changed file with 74 additions and 66 deletions.
140 changes: 74 additions & 66 deletions lite/kernels/opencl/conv_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ namespace opencl {
void ConvImageCompute::PrepareForRun() {
ReInitWhenNeeded();

bool bias_buffer_flag = false;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
is_mali_ = context.cl_context()->IsArmMali();
Expand Down Expand Up @@ -243,6 +244,7 @@ void ConvImageCompute::PrepareForRun() {
if (task_size <= threshold_2) {
CLImageConverterNBlock converter;
kernel_func_names_.push_back("conv2d_1x1_mali_h1w2c1");
bias_buffer_flag = true;
const DDim& filter_image_dims =
converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
Expand All @@ -259,14 +261,10 @@ void ConvImageCompute::PrepareForRun() {
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);

MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
} else if (task_size <= threshold_4) {
CLImageConverterN2Block converter;
kernel_func_names_.push_back("conv2d_1x1_mali_h1w2c2");
bias_buffer_flag = true;
const DDim& filter_image_dims =
converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
Expand All @@ -283,14 +281,10 @@ void ConvImageCompute::PrepareForRun() {
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);

MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
} else {
CLImageConverterN2Block converter;
kernel_func_names_.push_back("conv2d_1x1_mali_h2w2c2");
bias_buffer_flag = true;
const DDim& filter_image_dims =
converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
Expand All @@ -306,11 +300,6 @@ void ConvImageCompute::PrepareForRun() {
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);

MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
}
kernel_func_paths_.push_back("image/conv2d_1x1_default_mali_kernel.cl");
impl_ = &ConvImageCompute::Conv2d1x1opt;
Expand Down Expand Up @@ -385,6 +374,7 @@ void ConvImageCompute::PrepareForRun() {
if (is_mali_) {
kernel_func_names_.push_back("matrix_inner_product_mali");
kernel_func_names_.push_back("transform_to_output_mali");
bias_buffer_flag = true;
} else {
kernel_func_names_.push_back("matrix_inner_product");
kernel_func_names_.push_back("transform_to_output");
Expand All @@ -403,21 +393,24 @@ void ConvImageCompute::PrepareForRun() {
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);

// for mali
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);

MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
if (is_mali_) {
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);
} else {
MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
}
} else if (groups_ == 1) {
if (is_mali_ && input_tensor_n_ == 1) {
kernel_func_names_.push_back("conv2d_3x3_opt_mali");
bias_buffer_flag = true;
} else {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
Expand All @@ -439,17 +432,20 @@ void ConvImageCompute::PrepareForRun() {
auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_);
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);

w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);
MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
if (is_mali_ && input_tensor_n_ == 1) {
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);
} else {
MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
}
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
Expand Down Expand Up @@ -538,6 +534,7 @@ void ConvImageCompute::PrepareForRun() {
// conv2d_7x7
if (is_mali_ && input_tensor_n_ == 1) {
kernel_func_names_.push_back("conv2d_7x7_opt_mali");
bias_buffer_flag = true;
} else {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_7x7_multi_batch" : "conv2d_7x7_opt");
Expand All @@ -552,15 +549,20 @@ void ConvImageCompute::PrepareForRun() {

auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_);
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);
MUTABLE_DATA_GPU(
filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data);
if (is_mali_ && input_tensor_n_ == 1) {
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* w_gpu_data = w_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
TargetWrapperCL::MemcpySync(w_gpu_data,
tensor_hold_filter_image_->raw_data(),
tensor_hold_filter_image_->memory_size(),
IoDirection::HtoD);
} else {
MUTABLE_DATA_GPU(filter_gpu_image_,
filter_image_w_,
filter_image_h_,
filter_image_data);
}

impl_ = &ConvImageCompute::Conv2d7x7opt;
#endif
Expand Down Expand Up @@ -766,31 +768,37 @@ void ConvImageCompute::PrepareForRun() {
bias_converter.NCHWToImage(
bias_cpu_data, bias_image_data, conv_param_->bias->dims());

bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* f_gpu_data = bias_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
TargetWrapperCL::MemcpySync(f_gpu_data,
tensor_hold_bias_image_->raw_data(),
tensor_hold_bias_image_->memory_size(),
IoDirection::HtoD);
MUTABLE_DATA_GPU(bias_gpu_image_,
bias_image_dims[0],
bias_image_dims[1],
bias_image_data);
if (bias_buffer_flag) {
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* f_gpu_data = bias_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
TargetWrapperCL::MemcpySync(f_gpu_data,
tensor_hold_bias_image_->raw_data(),
tensor_hold_bias_image_->memory_size(),
IoDirection::HtoD);
} else {
MUTABLE_DATA_GPU(bias_gpu_image_,
bias_image_dims[0],
bias_image_dims[1],
bias_image_data);
}
} else {
bias_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
CLImageConverterFolder bias_converter;
tensor_hold_bias_image_->Resize({1, 1, 1, 4});
auto* bias_image_data = DATA_GPU(tensor_hold_bias_image_);

bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* f_gpu_data = bias_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
TargetWrapperCL::MemcpySync(f_gpu_data,
tensor_hold_bias_image_->raw_data(),
tensor_hold_bias_image_->memory_size(),
IoDirection::HtoD);
MUTABLE_DATA_GPU(bias_gpu_image_, 1, 1, bias_image_data);
if (bias_buffer_flag) {
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto* f_gpu_data = bias_gpu_t_->mutable_data(
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
TargetWrapperCL::MemcpySync(f_gpu_data,
tensor_hold_bias_image_->raw_data(),
tensor_hold_bias_image_->memory_size(),
IoDirection::HtoD);
} else {
MUTABLE_DATA_GPU(bias_gpu_image_, 1, 1, bias_image_data);
}
}

// scale options
Expand Down

0 comments on commit a3b1bc7

Please sign in to comment.