From 09d6e54ea88af3f9d753a869d7c39452d42ad98a Mon Sep 17 00:00:00 2001
From: sprouteer <89541335+sprouteer@users.noreply.github.com>
Date: Fri, 31 Dec 2021 10:27:00 +0800
Subject: [PATCH] [OpenCL]instance_norm support fp32 (#8021)

* instance_norm support fp32 test=develop
---
 .../cl_kernel/image/instance_norm_kernel.cl   |  54 ++--
 lite/kernels/opencl/CMakeLists.txt            |   5 +-
 .../opencl/instance_norm_image_compute.cc     |  74 +++--
 .../instance_norm_image_compute_test.cc       | 300 ++++++++++--------
 4 files changed, 253 insertions(+), 180 deletions(-)

diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl
index 48a49b16eba..44bc61ba599 100644
--- a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl
@@ -33,26 +33,27 @@ __kernel void instance_norm(__private const int in_width,
   const int local_total_size = local_work_size_x * local_work_size_y;
 
 #ifdef LOCAL_MEM_128
-  __local float4 shared_mem[128];
+  __local CL_COMPUTE_DTYPE4 shared_mem[128];
 #elif defined(LOCAL_MEM_64)
-  __local float4 shared_mem[64];
+  __local CL_COMPUTE_DTYPE4 shared_mem[64];
 #else
-  __local float4 shared_mem[256];
+  __local CL_COMPUTE_DTYPE4 shared_mem[256];
 #endif
 
   int xOffset = c * in_width;
   int yOffset = n * in_height;
-  float4 sum = 0.0f;
+
+  CL_COMPUTE_DTYPE4 sum = (CL_COMPUTE_DTYPE4)(0.0f);
   for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
     for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
-      sum += read_imagef(
-          input, SAMPLER, (int2)(xOffset + xIndex, yOffset + yIndex));
+      sum += READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR,
+                           input,
+                           SAMPLER,
+                           (int2)(xOffset + xIndex, yOffset + yIndex));
     }
   }
   shared_mem[local_id] = sum;
-
   barrier(CLK_LOCAL_MEM_FENCE);
-
   sum = 0.0f;
   if (local_id < 32) {
     for (int i = local_id + 32; i < local_total_size; i += 32) {
@@ -74,16 +75,18 @@ __kernel void instance_norm(__private const int in_width,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  const float4 mean_val = shared_mem[0];
+  const CL_COMPUTE_DTYPE4 mean_val = shared_mem[0];
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
   sum = 0.0f;
   for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
     for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
-      float4 temp =
-          read_imagef(
-              input, SAMPLER, (int2)(xOffset + xIndex, yOffset + yIndex)) -
+      CL_COMPUTE_DTYPE4 temp =
+          READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR,
+                        input,
+                        SAMPLER,
+                        (int2)(xOffset + xIndex, yOffset + yIndex)) -
           mean_val;
       sum += temp * temp;
     }
@@ -113,22 +116,33 @@ __kernel void instance_norm(__private const int in_width,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon));
+  const CL_COMPUTE_DTYPE4 sigma =
+      sqrt(shared_mem[0] + (CL_COMPUTE_DTYPE4)(epsilon));
+
+  CL_COMPUTE_DTYPE4 s = 1 / sigma;
+
+  CL_COMPUTE_DTYPE4 vscale =
+      READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, scale, SAMPLER, (int2)(c, n));
+  CL_COMPUTE_DTYPE4 vbias =
+      READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, bias, SAMPLER, (int2)(c, n));
 
-  float4 s = 1 / sigma;
-  float4 vscale = read_imagef(scale, SAMPLER, (int2)(c, n * in_c_group));
-  float4 vbias = read_imagef(bias, SAMPLER, (int2)(c, n * in_c_group));
   vscale *= s;
 
   for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
     for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
       int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex);
-      float4 in_val = read_imagef(input, SAMPLER, intout_pos);
-      half4 out_val = convert_half4((in_val - mean_val) * vscale + vbias);
+      CL_COMPUTE_DTYPE4 in_val =
+          READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, intout_pos);
+      CL_COMPUTE_DTYPE4 output0 = (in_val - mean_val) * vscale + vbias;
+      CL_DTYPE4 out_val;
+      out_val.x = CONVERT_TYPE_TO(output0.x, CL_DTYPE);
+      out_val.y = CONVERT_TYPE_TO(output0.y, CL_DTYPE);
+      out_val.z = CONVERT_TYPE_TO(output0.z, CL_DTYPE);
+      out_val.w = CONVERT_TYPE_TO(output0.w, CL_DTYPE);
 #ifdef RELU
-      out_val = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), out_val);
+      out_val = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), out_val);
 #endif
-      write_imageh(output, intout_pos, out_val);
+      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, intout_pos, out_val);
     }
   }
 }
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index e7a2e4754e0..08e683b8aaa 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -63,6 +63,7 @@ add_kernel(matmul_opencl_image OPENCL basic SRCS matmul_image_compute.cc)
 ######################
 # image kernel test  #
 ######################
+
 lite_cc_test(test_gather_image_opencl SRCS gather_image_compute_test.cpp
              DEPS kernels core)
 
@@ -127,8 +128,8 @@ lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_comput
 #lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc
 #             DEPS kernels core)
 
-#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
-#             DEPS kernels core)
+lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
+             DEPS kernels core)
 
 lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
              DEPS kernels core)
diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc b/lite/kernels/opencl/instance_norm_image_compute.cc
index f077f5f6d05..fae435a2985 100644
--- a/lite/kernels/opencl/instance_norm_image_compute.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute.cc
@@ -49,6 +49,11 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
 
     // TODO(ysh329): add instance_norm + relu pass
     // std::string build_options_ += "-DRELU";
+    const bool enable_fp16 =
+        CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16;
+    if (enable_fp16) {
+      build_options_ += " -DCL_DTYPE_half -DCL_DTYPE_FLOAT_FORCE ";
+    }
     if (out_h == 128) {
       build_options_ += " -DLOCAL_MEM_128";
     } else if (out_h == 64) {
@@ -75,29 +80,60 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
     int cgroup = (channel + 3) / 4;
     int cround = cgroup * 4;
 
-    std::vector<half_t> scale_img(cround * batch);
-    std::vector<half_t> bias_img(cround * batch);
     const float* scale_data = instance_norm_param_->scale->data<float>();
     const float* bias_data = instance_norm_param_->bias->data<float>();
 
-    for (int i = 0; i < channel; ++i) {
-      scale_img[i] = Float2Half(scale_data[i]);
-      bias_img[i] = Float2Half(bias_data[i]);
-    }
+    std::vector<float> scale_img(cround * batch);
+    std::vector<float> bias_img(cround * batch);
+
+    std::vector<half_t> scale_img_h(cround * batch);
+    std::vector<half_t> bias_img_h(cround * batch);
 
-    for (int i = 1; i < batch; ++i) {
-      memcpy(scale_img.data() + i * cround,
-             scale_img.data(),
-             cround * sizeof(half_t));
-      memcpy(bias_img.data() + i * cround,
-             bias_img.data(),
-             cround * sizeof(half_t));
-    }
     DDim scale_img_size{{ cgroup, batch }};
-    MUTABLE_DATA_GPU(
-        &scale_image_, scale_img_size[0], scale_img_size[1], scale_img.data());
-    MUTABLE_DATA_GPU(
-        &bias_image_, scale_img_size[0], scale_img_size[1], bias_img.data());
+
+    if (enable_fp16) {
+      for (int i = 0; i < channel; ++i) {
+        scale_img_h[i] = Float2Half(scale_data[i]);
+        bias_img_h[i] = Float2Half(bias_data[i]);
+      }
+
+      for (int i = 1; i < batch; ++i) {
+        memcpy(scale_img_h.data() + i * cround,
+               scale_img_h.data(),
+               cround * sizeof(half_t));
+        memcpy(bias_img_h.data() + i * cround,
+               bias_img_h.data(),
+               cround * sizeof(half_t));
+      }
+      MUTABLE_DATA_GPU(&scale_image_,
+                       scale_img_size[0],
+                       scale_img_size[1],
+                       scale_img_h.data());
+      MUTABLE_DATA_GPU(&bias_image_,
+                       scale_img_size[0],
+                       scale_img_size[1],
+                       bias_img_h.data());
+    } else {
+      for (int i = 0; i < channel; ++i) {
+        scale_img[i] = scale_data[i];
+        bias_img[i] = bias_data[i];
+      }
+
+      for (int i = 1; i < batch; ++i) {
+        memcpy(scale_img.data() + i * cround,
+               scale_img.data(),
+               cround * sizeof(float));
+        memcpy(bias_img.data() + i * cround,
+               bias_img.data(),
+               cround * sizeof(float));
+      }
+      MUTABLE_DATA_GPU(&scale_image_,
+                       scale_img_size[0],
+                       scale_img_size[1],
+                       scale_img.data());
+      MUTABLE_DATA_GPU(
+          &bias_image_, scale_img_size[0], scale_img_size[1], bias_img.data());
+    }
   }
 
   void ReInitWhenNeeded() override {
@@ -182,6 +218,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
 
     status = EnqueueNDRangeKernel(
         context, kernel_, cl::NullRange, gws_, lws_, nullptr, event_);
+
     CL_CHECK_FATAL(status);
   }
 
@@ -324,7 +361,6 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
   std::string time_stamp_{GetTimeStamp()};
   cl::Kernel kernel_;
   cl::NDRange gws_, lws_;
-
   Tensor scale_image_;
   Tensor bias_image_;
 };
diff --git a/lite/kernels/opencl/instance_norm_image_compute_test.cc b/lite/kernels/opencl/instance_norm_image_compute_test.cc
index 63d172f5edc..10fdec058b3 100644
--- a/lite/kernels/opencl/instance_norm_image_compute_test.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute_test.cc
@@ -82,156 +82,178 @@ void instance_norm_ref(Tensor* x,
 // #define INSTANCE_NORM_FP16_LOOP_TEST
 // #define INSTANCE_NORM_FP16_PRINT_RESULT
 TEST(instance_norm_image2d, compute) {
+  for (const auto precision_type :
+       {lite_api::CLPrecisionType::CL_PRECISION_FP32,
+        lite_api::CLPrecisionType::CL_PRECISION_FP16}) {
 #ifdef INSTANCE_NORM_FP16_LOOP_TEST
-  for (auto n : {1, 3}) {
-    for (auto c : {1, 3, 8, 32, 65}) {
-      for (auto h : {4, 20, 64, 112, 224}) {
-        for (auto w : {2, 20, 64, 112, 224}) {
+    for (auto n : {1, 3}) {
+      for (auto c : {1, 3, 8, 32}) {
+        for (auto h : {4, 20, 64, 112}) {
+          for (auto w : {
+                   2, 20, 64, 112,
+               }) {
 #else
-  const int n = 1;
-  const int c = 32;
-  const int h = 224;
-  const int w = 224;
+    const int n = 2;
+    const int c = 7;
+    const int h = 71;
+    const int w = 11;
 #endif  // INSTANCE_NORM_FP16_LOOP_TEST
 
-          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
-                    << h << " " << w << " ========";
-
-          auto kernels =
-              KernelRegistry::Global().Create("instance_norm",
-                                              TARGET(kOpenCL),
-                                              PRECISION(kFP16),
-                                              DATALAYOUT(kImageDefault));
-          ASSERT_FALSE(kernels.empty());
-          auto kernel = std::move(kernels.front());
-          LOG(INFO) << "get kernel:" << kernel->doc();
-
-          lite::Tensor x, out, out_ref, scale, bias, saved_mean, saved_variance;
-          operators::InstanceNormParam param;
-          param.x = &x;
-          param.out = &out;
-          param.scale = &scale;
-          param.bias = &bias;
-          param.saved_mean = &saved_mean;
-          param.saved_variance = &saved_variance;
-          param.epsilon = 1e-5;
-          std::unique_ptr<KernelContext> context(new KernelContext);
-          context->As<OpenCLContext>().InitOnce();
-
-          kernel->SetParam(param);
-          std::unique_ptr<KernelContext> instance_context(new KernelContext);
-          context->As<OpenCLContext>().CopySharedTo(
-              &(instance_context->As<OpenCLContext>()));
-          kernel->SetContext(std::move(instance_context));
-
-          const DDim in_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
-          x.Resize(in_dim);
-          out.Resize(in_dim);
-          out_ref.Resize(in_dim);
-          scale.Resize({c});
-          bias.Resize({c});
-          saved_mean.Resize({n * c});
-          saved_variance.Resize({n * c});
-          auto* x_data = x.mutable_data<float>();
-          auto* scale_data = scale.mutable_data<float>();
-          auto* bias_data = bias.mutable_data<float>();
-          auto* saved_mean_data = saved_mean.mutable_data<float>();
-          auto* saved_variance_data = saved_variance.mutable_data<float>();
-          std::default_random_engine engine;
-          std::uniform_real_distribution<float> dist(-1, 1);
-          int sum = n * c * h * w;
-          for (int i = 0; i < sum; ++i) {
-            x_data[i] = dist(engine);
-          }
-          for (int i = 0; i < c; ++i) {
-            scale_data[i] = dist(engine);
-            bias_data[i] = dist(engine);
-          }
-          //! run reference instance norm
-          instance_norm_ref(
-              &x, &out_ref, &scale, &bias, &saved_mean, &saved_variance, 1e-5);
-          LOG(INFO) << "prepare input";
-          CLImageConverterDefault* default_converter =
-              new CLImageConverterDefault();
-          DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
-          LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
-                    << x_image_shape[1];
-          std::vector<half_t> x_image_data(x_image_shape.production() *
-                                           4);  // 4 : RGBA
-          default_converter->NCHWToImage(x_data, x_image_data.data(), in_dim);
-          auto* x_image = x.mutable_data<half_t, cl::Image2D>(
-              x_image_shape[0], x_image_shape[1], x_image_data.data());
-
-          auto* out_image = out.mutable_data<half_t, cl::Image2D>(
-              x_image_shape[0], x_image_shape[1]);
-
-          //! warm up
-          for (int i = 0; i < FLAGS_warmup; ++i) {
-            kernel->Launch();
-          }
-          context->As<OpenCLContext>().cl_context()->GetCommandQueue().finish();
-          //! compute
-          Timer t0;
-          t0.Start();
-          for (int i = 0; i < FLAGS_repeats; ++i) {
-            kernel->Launch();
-          }
-          context->As<OpenCLContext>().cl_context()->GetCommandQueue().finish();
-          t0.Stop();
-          double gops = 6 * sum;
-          LOG(INFO) << "avg time: " << t0.LapTimes().Avg() / FLAGS_repeats
-                    << " ms, "
-                    << "avg GOPs: "
-                    << 1e-6 * gops * FLAGS_repeats / t0.LapTimes().Avg()
-                    << " GOPs";
-          const size_t cl_image2d_row_pitch{0};
-          const size_t cl_image2d_slice_pitch{0};
-          half_t* out_image_data = new half_t[x_image_shape.production() * 4];
-          TargetWrapperCL::ImgcpySync(out_image_data,
-                                      out_image,
-                                      x_image_shape[0],
-                                      x_image_shape[1],
-                                      cl_image2d_row_pitch,
-                                      cl_image2d_slice_pitch,
-                                      IoDirection::DtoH);
-          float* out_data = new float[x_image_shape.production() * 4];
-          default_converter->ImageToNCHW(
-              out_image_data, out_data, x_image_shape, in_dim);
-// result
+            LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
+                      << " " << h << " " << w << " ========";
+            std::unique_ptr<KernelContext> context(new KernelContext);
+            context->As<OpenCLContext>().InitOnce();
+            CLRuntime::Global()->set_precision(precision_type);
+            const bool fp16_flag =
+                (precision_type ==
+                 lite_api::CLPrecisionType::CL_PRECISION_FP16);
+            LOG(INFO) << "\n\t[  START  ] Test Precision="
+                      << lite_api::CLPrecisionTypeToStr(precision_type);
+            auto kernels =
+                KernelRegistry::Global().Create("instance_norm",
+                                                TARGET(kOpenCL),
+                                                PRECISION(kFP16),
+                                                DATALAYOUT(kImageDefault));
+            ASSERT_FALSE(kernels.empty());
+            auto kernel = std::move(kernels.front());
+            LOG(INFO) << "get kernel:" << kernel->doc();
+
+            lite::Tensor x, out, out_ref, scale, bias, saved_mean,
+                saved_variance;
+            operators::InstanceNormParam param;
+            param.x = &x;
+            param.out = &out;
+            param.scale = &scale;
+            param.bias = &bias;
+            param.saved_mean = &saved_mean;
+            param.saved_variance = &saved_variance;
+            param.epsilon = 1e-5;
+
+            kernel->SetParam(param);
+            kernel->SetContext(std::move(context));
+
+            const DDim in_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
+            x.Resize(in_dim);
+            out.Resize(in_dim);
+            out_ref.Resize(in_dim);
+            scale.Resize({c});
+            bias.Resize({c});
+            saved_mean.Resize({n * c});
+            saved_variance.Resize({n * c});
+            auto* x_data = x.mutable_data<float>();
+            auto* scale_data = scale.mutable_data<float>();
+            auto* bias_data = bias.mutable_data<float>();
+            auto* saved_mean_data = saved_mean.mutable_data<float>();
+            auto* saved_variance_data = saved_variance.mutable_data<float>();
+            std::default_random_engine engine;
+            std::uniform_real_distribution<float> dist(-1, 1);
+            int sum = n * c * h * w;
+            for (int i = 0; i < sum; ++i) {
+              x_data[i] = dist(engine);
+            }
+            for (int i = 0; i < c; ++i) {
+              scale_data[i] = dist(engine);
+              bias_data[i] = dist(engine);
+            }
+            //! run reference instance norm
+            instance_norm_ref(&x,
+                              &out_ref,
+                              &scale,
+                              &bias,
+                              &saved_mean,
+                              &saved_variance,
+                              1e-5);
+            LOG(INFO) << "prepare input";
+            CLImageConverterDefault* default_converter =
+                new CLImageConverterDefault();
+            DDim x_image_shape =
+                default_converter->InitImageDimInfoWith(in_dim);
+            LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+                      << x_image_shape[1];
+
+            const size_t dtype_size =
+                fp16_flag ? sizeof(half_t) : sizeof(float);
+            std::vector<char> x_image_data(x_image_shape.production() * 4 *
+                                           dtype_size);  // 4 : RGBA
+            default_converter->NCHWToImage(x_data, x_image_data.data(), in_dim);
+            MUTABLE_DATA_GPU(
+                &x, x_image_shape[0], x_image_shape[1], x_image_data.data());
+
+            auto* out_image = MUTABLE_DATA_GPU(
+                &out, x_image_shape[0], x_image_shape[1], nullptr);
+
+            //! warm up
+            for (int i = 0; i < FLAGS_warmup; ++i) {
+              kernel->Launch();
+            }
+            CLRuntime::Global()->command_queue().finish();
+            //! compute
+            Timer t0;
+            t0.Start();
+            for (int i = 0; i < FLAGS_repeats; ++i) {
+              kernel->Launch();
+            }
+            CLRuntime::Global()->command_queue().finish();
+            t0.Stop();
+            double gops = 6 * sum;
+            LOG(INFO) << "avg time: " << t0.LapTimes().Avg() / FLAGS_repeats
+                      << " ms, "
+                      << "avg GOPs: "
+                      << 1e-6 * gops * FLAGS_repeats / t0.LapTimes().Avg()
+                      << " GOPs";
+            const size_t cl_image2d_row_pitch{0};
+            const size_t cl_image2d_slice_pitch{0};
+            std::vector<char> out_image_data(x_image_shape.production() * 4 *
+                                             dtype_size);  // 4 : RGBA
+            TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                                        out_image,
+                                        x_image_shape[0],
+                                        x_image_shape[1],
+                                        cl_image2d_row_pitch,
+                                        cl_image2d_slice_pitch,
+                                        IoDirection::DtoH);
+            float* out_data = new float[x_image_shape.production() * 4];
+            default_converter->ImageToNCHW(
+                out_image_data.data(), out_data, x_image_shape, in_dim);
+            auto* out_ref_data = out_ref.data<float>();
+
 #ifdef INSTANCE_NORM_FP16_PRINT_RESULT
-          LOG(INFO) << "---- print kernel result (input -> output) ----";
-          for (int eidx = 0; eidx < in_dim.production(); ++eidx) {
-            std::cout << x_data[eidx] << " -> " << out_data[eidx] << std::endl;
-          }
+            LOG(INFO) << "---- print kernel result (input -> output) ----";
+            for (int eidx = 0; eidx < in_dim.production(); ++eidx) {
+              std::cout << x_data[eidx] << " -> " << out_data[eidx]
+                        << " out_ref_data: " << out_ref_data[eidx] << std::endl;
+            }
 #endif  // INSTANCE_NORM_FP16_PRINT_RESULT
-          auto* out_ref_data = out_ref.data<float>();
-          for (int i = 0; i < in_dim.production(); i++) {
-            auto abs_diff = abs(out_data[i] - out_ref_data[i]);
-            auto relative_diff =
-                COMPUTE_RELATIVE_DIFF(out_data[i], out_ref_data[i]);
-            EXPECT_EQ(
-                (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
-                true);
-            if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
-              LOG(ERROR) << "error idx:" << i << ", in_data[" << i
-                         << "]: " << x_data[i] << ", out_data[" << i
-                         << "]: " << out_data[i] << ", out_ref[" << i
-                         << "]: " << out_ref_data[i]
-                         << ", abs_diff: " << abs_diff
-                         << ", relative_diff: " << relative_diff
-                         << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF;
+
+            for (int i = 0; i < in_dim.production(); i++) {
+              auto abs_diff = abs(out_data[i] - out_ref_data[i]);
+              auto relative_diff =
+                  COMPUTE_RELATIVE_DIFF(out_data[i], out_ref_data[i]);
+              EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
+                            (abs_diff <= FP16_MAX_DIFF),
+                        true);
+              if ((relative_diff > FP16_MAX_DIFF) &&
+                  (abs_diff > FP16_MAX_DIFF)) {
+                LOG(ERROR) << "error idx:" << i << ", in_data[" << i
+                           << "]: " << x_data[i] << ", out_data[" << i
+                           << "]: " << out_data[i] << ", out_ref[" << i
+                           << "]: " << out_ref_data[i]
+                           << ", abs_diff: " << abs_diff
+                           << ", relative_diff: " << relative_diff
+                           << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF;
+              }
             }
-          }
-          delete[] out_data;
-          delete[] out_image_data;
+            delete[] out_data;
+// delete[] out_image_data;
 #ifdef INSTANCE_NORM_FP16_LOOP_TEST
-        }  // w
-      }    // h
-    }      // c
-  }        // n
+          }  // w
+        }    // h
+      }      // c
+    }        // n
 #else
 // nothing to do.
 #endif
+  }
 }
 
 }  // namespace lite