From 09d6e54ea88af3f9d753a869d7c39452d42ad98a Mon Sep 17 00:00:00 2001 From: sprouteer <89541335+sprouteer@users.noreply.github.com> Date: Fri, 31 Dec 2021 10:27:00 +0800 Subject: [PATCH] [OpenCL]instance_norm support fp32 (#8021) * instance_norm support fp32 test=develop --- .../cl_kernel/image/instance_norm_kernel.cl | 54 ++-- lite/kernels/opencl/CMakeLists.txt | 5 +- .../opencl/instance_norm_image_compute.cc | 74 +++-- .../instance_norm_image_compute_test.cc | 300 ++++++++++-------- 4 files changed, 253 insertions(+), 180 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl index 48a49b16eba..44bc61ba599 100644 --- a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -33,26 +33,27 @@ __kernel void instance_norm(__private const int in_width, const int local_total_size = local_work_size_x * local_work_size_y; #ifdef LOCAL_MEM_128 - __local float4 shared_mem[128]; + __local CL_COMPUTE_DTYPE4 shared_mem[128]; #elif defined(LOCAL_MEM_64) - __local float4 shared_mem[64]; + __local CL_COMPUTE_DTYPE4 shared_mem[64]; #else - __local float4 shared_mem[256]; + __local CL_COMPUTE_DTYPE4 shared_mem[256]; #endif int xOffset = c * in_width; int yOffset = n * in_height; - float4 sum = 0.0f; + + CL_COMPUTE_DTYPE4 sum = (CL_COMPUTE_DTYPE4)(0.0f); for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { - sum += read_imagef( - input, SAMPLER, (int2)(xOffset + xIndex, yOffset + yIndex)); + sum += READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, + input, + SAMPLER, + (int2)(xOffset + xIndex, yOffset + yIndex)); } } shared_mem[local_id] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - sum = 0.0f; if (local_id < 32) { for (int i = local_id + 32; i < local_total_size; i += 32) { @@ -74,16 +75,18 @@ __kernel void instance_norm(__private const int in_width, barrier(CLK_LOCAL_MEM_FENCE); - const float4 mean_val = shared_mem[0]; + const CL_COMPUTE_DTYPE4 mean_val = shared_mem[0]; barrier(CLK_LOCAL_MEM_FENCE); sum = 0.0f; for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { - float4 temp = - read_imagef( - input, SAMPLER, (int2)(xOffset + xIndex, yOffset + yIndex)) - + CL_COMPUTE_DTYPE4 temp = + READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, + input, + SAMPLER, + (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; sum += temp * temp; } @@ -113,22 +116,33 @@ __kernel void instance_norm(__private const int in_width, barrier(CLK_LOCAL_MEM_FENCE); - const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + const CL_COMPUTE_DTYPE4 sigma = + sqrt(shared_mem[0] + (CL_COMPUTE_DTYPE4)(epsilon)); + + CL_COMPUTE_DTYPE4 s = 1 / sigma; + + CL_COMPUTE_DTYPE4 vscale = + READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, scale, SAMPLER, (int2)(c, n)); + CL_COMPUTE_DTYPE4 vbias = + READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, bias, SAMPLER, (int2)(c, n)); - float4 s = 1 / sigma; - float4 vscale = read_imagef(scale, SAMPLER, (int2)(c, n * in_c_group)); - float4 vbias = read_imagef(bias, SAMPLER, (int2)(c, n * in_c_group)); vscale *= s; for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); - float4 in_val = read_imagef(input, SAMPLER, intout_pos); - half4 out_val = convert_half4((in_val - mean_val) * vscale + vbias); + CL_COMPUTE_DTYPE4 in_val = + READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, intout_pos); + CL_COMPUTE_DTYPE4 output0 = (in_val - mean_val) * vscale + vbias; + CL_DTYPE4 out_val; + out_val.x = CONVERT_TYPE_TO(output0.x, CL_DTYPE); + out_val.y = CONVERT_TYPE_TO(output0.y, CL_DTYPE); + out_val.z = CONVERT_TYPE_TO(output0.z, CL_DTYPE); + out_val.w = CONVERT_TYPE_TO(output0.w, CL_DTYPE); #ifdef RELU - out_val = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), out_val); + out_val = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), out_val); #endif - write_imageh(output, intout_pos, out_val); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, intout_pos, out_val); } } } diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index e7a2e4754e0..08e683b8aaa 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -63,6 +63,7 @@ add_kernel(matmul_opencl_image OPENCL basic SRCS matmul_image_compute.cc) ###################### # image kernel test # ###################### + lite_cc_test(test_gather_image_opencl SRCS gather_image_compute_test.cpp DEPS kernels core) @@ -127,8 +128,8 @@ lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_comput #lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc # DEPS kernels core) -#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc -# DEPS kernels core) +lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc + DEPS kernels core) lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc DEPS kernels core) diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc b/lite/kernels/opencl/instance_norm_image_compute.cc index f077f5f6d05..fae435a2985 100644 --- a/lite/kernels/opencl/instance_norm_image_compute.cc +++ b/lite/kernels/opencl/instance_norm_image_compute.cc @@ -49,6 +49,11 @@ class InstanceNormImageCompute : public KernelLiteget_precision() == lite_api::CL_PRECISION_FP16; + if (enable_fp16) { + build_options_ += " -DCL_DTYPE_half -DCL_DTYPE_FLOAT_FORCE "; + } if (out_h == 128) { build_options_ += " -DLOCAL_MEM_128"; } else if (out_h == 64) { @@ -75,29 +80,60 @@ class InstanceNormImageCompute : public KernelLite scale_img(cround * batch); - std::vector bias_img(cround * batch); const float* scale_data = instance_norm_param_->scale->data(); const float* bias_data = instance_norm_param_->bias->data(); - for (int i = 0; i < channel; ++i) { - scale_img[i] = Float2Half(scale_data[i]); - bias_img[i] = Float2Half(bias_data[i]); - } + std::vector scale_img(cround * batch); + std::vector bias_img(cround * batch); + + std::vector scale_img_h(cround * batch); + std::vector bias_img_h(cround * batch); - for (int i = 1; i < batch; ++i) { - memcpy(scale_img.data() + i * cround, - scale_img.data(), - cround * sizeof(half_t)); - memcpy(bias_img.data() + i * cround, - bias_img.data(), - cround * sizeof(half_t)); - } DDim scale_img_size{{ cgroup, batch }}; - MUTABLE_DATA_GPU( - &scale_image_, scale_img_size[0], scale_img_size[1], scale_img.data()); - MUTABLE_DATA_GPU( - &bias_image_, scale_img_size[0], scale_img_size[1], bias_img.data()); + + if (enable_fp16) { + for (int i = 0; i < channel; ++i) { + scale_img_h[i] = Float2Half(scale_data[i]); + bias_img_h[i] = Float2Half(bias_data[i]); + } + + for (int i = 1; i < batch; ++i) { + memcpy(scale_img_h.data() + i * cround, + scale_img_h.data(), + cround * sizeof(half_t)); + memcpy(bias_img_h.data() + i * cround, + bias_img_h.data(), + cround * sizeof(half_t)); + } + MUTABLE_DATA_GPU(&scale_image_, + scale_img_size[0], + scale_img_size[1], + scale_img_h.data()); + MUTABLE_DATA_GPU(&bias_image_, + scale_img_size[0], + scale_img_size[1], + bias_img_h.data()); + } else { + for (int i = 0; i < channel; ++i) { + scale_img[i] = scale_data[i]; + bias_img[i] = bias_data[i]; + } + + for (int i = 1; i < batch; ++i) { + memcpy(scale_img.data() + i * cround, + scale_img.data(), + cround * sizeof(float)); + memcpy(bias_img.data() + i * cround, + bias_img.data(), + cround * sizeof(float)); + } + MUTABLE_DATA_GPU(&scale_image_, + scale_img_size[0], + scale_img_size[1], + scale_img.data()); + MUTABLE_DATA_GPU( + &bias_image_, scale_img_size[0], scale_img_size[1], bias_img.data()); + } } void ReInitWhenNeeded() override { @@ -182,6 +218,7 @@ class InstanceNormImageCompute : public KernelLitedoc(); - - lite::Tensor x, out, out_ref, scale, bias, saved_mean, saved_variance; - operators::InstanceNormParam param; - param.x = &x; - param.out = &out; - param.scale = &scale; - param.bias = &bias; - param.saved_mean = &saved_mean; - param.saved_variance = &saved_variance; - param.epsilon = 1e-5; - std::unique_ptr context(new KernelContext); - context->As().InitOnce(); - - kernel->SetParam(param); - std::unique_ptr instance_context(new KernelContext); - context->As().CopySharedTo( - &(instance_context->As())); - kernel->SetContext(std::move(instance_context)); - - const DDim in_dim = DDim(std::vector{n, c, h, w}); - x.Resize(in_dim); - out.Resize(in_dim); - out_ref.Resize(in_dim); - scale.Resize({c}); - bias.Resize({c}); - saved_mean.Resize({n * c}); - saved_variance.Resize({n * c}); - auto* x_data = x.mutable_data(); - auto* scale_data = scale.mutable_data(); - auto* bias_data = bias.mutable_data(); - auto* saved_mean_data = saved_mean.mutable_data(); - auto* saved_variance_data = saved_variance.mutable_data(); - std::default_random_engine engine; - std::uniform_real_distribution dist(-1, 1); - int sum = n * c * h * w; - for (int i = 0; i < sum; ++i) { - x_data[i] = dist(engine); - } - for (int i = 0; i < c; ++i) { - scale_data[i] = dist(engine); - bias_data[i] = dist(engine); - } - //! run reference instance norm - instance_norm_ref( - &x, &out_ref, &scale, &bias, &saved_mean, &saved_variance, 1e-5); - LOG(INFO) << "prepare input"; - CLImageConverterDefault* default_converter = - new CLImageConverterDefault(); - DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim); - LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " - << x_image_shape[1]; - std::vector x_image_data(x_image_shape.production() * - 4); // 4 : RGBA - default_converter->NCHWToImage(x_data, x_image_data.data(), in_dim); - auto* x_image = x.mutable_data( - x_image_shape[0], x_image_shape[1], x_image_data.data()); - - auto* out_image = out.mutable_data( - x_image_shape[0], x_image_shape[1]); - - //! warm up - for (int i = 0; i < FLAGS_warmup; ++i) { - kernel->Launch(); - } - context->As().cl_context()->GetCommandQueue().finish(); - //! compute - Timer t0; - t0.Start(); - for (int i = 0; i < FLAGS_repeats; ++i) { - kernel->Launch(); - } - context->As().cl_context()->GetCommandQueue().finish(); - t0.Stop(); - double gops = 6 * sum; - LOG(INFO) << "avg time: " << t0.LapTimes().Avg() / FLAGS_repeats - << " ms, " - << "avg GOPs: " - << 1e-6 * gops * FLAGS_repeats / t0.LapTimes().Avg() - << " GOPs"; - const size_t cl_image2d_row_pitch{0}; - const size_t cl_image2d_slice_pitch{0}; - half_t* out_image_data = new half_t[x_image_shape.production() * 4]; - TargetWrapperCL::ImgcpySync(out_image_data, - out_image, - x_image_shape[0], - x_image_shape[1], - cl_image2d_row_pitch, - cl_image2d_slice_pitch, - IoDirection::DtoH); - float* out_data = new float[x_image_shape.production() * 4]; - default_converter->ImageToNCHW( - out_image_data, out_data, x_image_shape, in_dim); -// result + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c + << " " << h << " " << w << " ========"; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + CLRuntime::Global()->set_precision(precision_type); + const bool fp16_flag = + (precision_type == + lite_api::CLPrecisionType::CL_PRECISION_FP16); + LOG(INFO) << "\n\t[ START ] Test Precision=" + << lite_api::CLPrecisionTypeToStr(precision_type); + auto kernels = + KernelRegistry::Global().Create("instance_norm", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + LOG(INFO) << "get kernel:" << kernel->doc(); + + lite::Tensor x, out, out_ref, scale, bias, saved_mean, + saved_variance; + operators::InstanceNormParam param; + param.x = &x; + param.out = &out; + param.scale = &scale; + param.bias = &bias; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + param.epsilon = 1e-5; + + kernel->SetParam(param); + kernel->SetContext(std::move(context)); + + const DDim in_dim = DDim(std::vector{n, c, h, w}); + x.Resize(in_dim); + out.Resize(in_dim); + out_ref.Resize(in_dim); + scale.Resize({c}); + bias.Resize({c}); + saved_mean.Resize({n * c}); + saved_variance.Resize({n * c}); + auto* x_data = x.mutable_data(); + auto* scale_data = scale.mutable_data(); + auto* bias_data = bias.mutable_data(); + auto* saved_mean_data = saved_mean.mutable_data(); + auto* saved_variance_data = saved_variance.mutable_data(); + std::default_random_engine engine; + std::uniform_real_distribution dist(-1, 1); + int sum = n * c * h * w; + for (int i = 0; i < sum; ++i) { + x_data[i] = dist(engine); + } + for (int i = 0; i < c; ++i) { + scale_data[i] = dist(engine); + bias_data[i] = dist(engine); + } + //! run reference instance norm + instance_norm_ref(&x, + &out_ref, + &scale, + &bias, + &saved_mean, + &saved_variance, + 1e-5); + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = + new CLImageConverterDefault(); + DDim x_image_shape = + default_converter->InitImageDimInfoWith(in_dim); + LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " + << x_image_shape[1]; + + const size_t dtype_size = + fp16_flag ? sizeof(half_t) : sizeof(float); + std::vector x_image_data(x_image_shape.production() * 4 * + dtype_size); // 4 : RGBA + default_converter->NCHWToImage(x_data, x_image_data.data(), in_dim); + MUTABLE_DATA_GPU( + &x, x_image_shape[0], x_image_shape[1], x_image_data.data()); + + auto* out_image = MUTABLE_DATA_GPU( + &out, x_image_shape[0], x_image_shape[1], nullptr); + + //! warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel->Launch(); + } + CLRuntime::Global()->command_queue().finish(); + //! compute + Timer t0; + t0.Start(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel->Launch(); + } + CLRuntime::Global()->command_queue().finish(); + t0.Stop(); + double gops = 6 * sum; + LOG(INFO) << "avg time: " << t0.LapTimes().Avg() / FLAGS_repeats + << " ms, " + << "avg GOPs: " + << 1e-6 * gops * FLAGS_repeats / t0.LapTimes().Avg() + << " GOPs"; + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + std::vector out_image_data(x_image_shape.production() * 4 * + dtype_size); // 4 : RGBA + TargetWrapperCL::ImgcpySync(out_image_data.data(), + out_image, + x_image_shape[0], + x_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[x_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data.data(), out_data, x_image_shape, in_dim); + auto* out_ref_data = out_ref.data(); + #ifdef INSTANCE_NORM_FP16_PRINT_RESULT - LOG(INFO) << "---- print kernel result (input -> output) ----"; - for (int eidx = 0; eidx < in_dim.production(); ++eidx) { - std::cout << x_data[eidx] << " -> " << out_data[eidx] << std::endl; - } + LOG(INFO) << "---- print kernel result (input -> output) ----"; + for (int eidx = 0; eidx < in_dim.production(); ++eidx) { + std::cout << x_data[eidx] << " -> " << out_data[eidx] + << " out_ref_data: " << out_ref_data[eidx] << std::endl; + } #endif // INSTANCE_NORM_FP16_PRINT_RESULT - auto* out_ref_data = out_ref.data(); - for (int i = 0; i < in_dim.production(); i++) { - auto abs_diff = abs(out_data[i] - out_ref_data[i]); - auto relative_diff = - COMPUTE_RELATIVE_DIFF(out_data[i], out_ref_data[i]); - EXPECT_EQ( - (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), - true); - if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) { - LOG(ERROR) << "error idx:" << i << ", in_data[" << i - << "]: " << x_data[i] << ", out_data[" << i - << "]: " << out_data[i] << ", out_ref[" << i - << "]: " << out_ref_data[i] - << ", abs_diff: " << abs_diff - << ", relative_diff: " << relative_diff - << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; + + for (int i = 0; i < in_dim.production(); i++) { + auto abs_diff = abs(out_data[i] - out_ref_data[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_data[i], out_ref_data[i]); + EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || + (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && + (abs_diff > FP16_MAX_DIFF)) { + LOG(ERROR) << "error idx:" << i << ", in_data[" << i + << "]: " << x_data[i] << ", out_data[" << i + << "]: " << out_data[i] << ", out_ref[" << i + << "]: " << out_ref_data[i] + << ", abs_diff: " << abs_diff + << ", relative_diff: " << relative_diff + << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; + } } - } - delete[] out_data; - delete[] out_image_data; + delete[] out_data; +// delete[] out_image_data; #ifdef INSTANCE_NORM_FP16_LOOP_TEST - } // w - } // h - } // c - } // n + } // w + } // h + } // c + } // n #else // nothing to do. #endif + } } } // namespace lite