[X86] add grid_sample (#5895)
zhupengyang authored Apr 14, 2021
1 parent e642cdc commit fd45946
Showing 5 changed files with 403 additions and 14 deletions.
5 changes: 1 addition & 4 deletions lite/kernels/x86/CMakeLists.txt
@@ -38,11 +38,8 @@ add_kernel(gru_unit_compute_x86 X86 basic SRCS gru_unit_compute.cc DEPS ${lite_k
 add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project)
 
-# lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
 add_kernel(gather_compute_x86 X86 extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
-# lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
-# lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
-# lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
+add_kernel(grid_sampler_compute_x86 X86 extra SRCS grid_sampler_compute.cc DEPS ${lite_kernel_deps} math_function)
 add_kernel(mul_compute_x86 X86 basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} blas)
 add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling)
347 changes: 347 additions & 0 deletions lite/kernels/x86/grid_sampler_compute.cc
@@ -0,0 +1,347 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/x86/grid_sampler_compute.h"
#include <string>
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/op_registry.h"
#include "lite/fluid/eigen.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

template <typename T,
size_t D,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = lite::fluid::EigenTensor<T, D, MajorType, IndexType>;

using Array4 = Eigen::DSizes<int64_t, 4>;

template <typename T>
inline bool IsInBound(T x, T y, T x_max, T y_max) {
return !(x < static_cast<T>(0) || x > x_max || y < static_cast<T>(0) ||
y > y_max);
}

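// Maps normalized grid coordinates in [-1, 1] to pixel coordinates:
//   align_corners == true : x_p = (x + 1) / 2 * max_val
//   align_corners == false: x_p = (x + 1) * (max_val + 1) / 2 - 0.5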
template <typename T>
void Unnormalize(const X86Context& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners) {
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);

if (!align_corners) {
auto factor = static_cast<T>((max_val + 1) * 0.5);
grid_slice_t.device(place) =
(grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
} else {
auto factor = static_cast<T>(max_val * 0.5);
grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
}
}

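// Keeps unnormalized coordinates inside the feature map according to
// padding_mode: "border" clamps to [0, max_val]; "reflection" folds
// out-of-range values back across the boundaries. Other modes leave the
// coordinates untouched here (out-of-bound points are zero-filled later).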
template <typename T>
void Clip(const X86Context& ctx,
Tensor* grid_slice,
const int max_val, // height-1 or width-1
bool align_corners,
std::string padding_mode) {
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
if (padding_mode == "border") {
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
} else if (padding_mode == "reflection") {
if (align_corners) {
auto double_range = static_cast<T>(max_val * 2);
auto grid_abs = grid_slice_t.abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
} else {
auto double_range = static_cast<T>((max_val + 1) * 2);
auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
grid_slice_t.device(place) =
extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
.cwiseMin(static_cast<T>(max_val));
}
}
}

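// Splits the (n, h, w, 2) grid into separate x and y tensors, then converts
// them from normalized [-1, 1] coordinates to clipped pixel coordinates.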
template <class T>
void CalcGridLocations(const X86Context& ctx,
const Tensor& grid,
const int in_h,
const int in_w,
bool align_corners,
std::string padding_mode,
Tensor* grid_x,
Tensor* grid_y) {
const int n = grid.dims()[0];
const int out_h = grid.dims()[1];
const int out_w = grid.dims()[2];

// split the grid of shape (n, h, w, 2) into x and y coordinates along the
// last dimension (index 3)
DDim grid_dim{{n, out_h, out_w}};
grid_x->Resize(grid_dim);
grid_y->Resize(grid_dim);
T* grid_x_data = grid_x->template mutable_data<T>();
T* grid_y_data = grid_y->template mutable_data<T>();
const T* grid_data = grid.data<T>();
for (int i = 0; i < n * out_h * out_w; i++) {
grid_x_data[i] = grid_data[2 * i];
grid_y_data[i] = grid_data[(2 * i) + 1];
}

Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);

Clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
Clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
}

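// Gathers input values at the rounded (x, y) locations for every channel;
// out-of-bound locations keep the zero initialization of the output.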
template <typename T>
void GetGridPointValue(const Tensor& input,
Tensor* output,
const Tensor& x,
const Tensor& y) {
const int n = input.dims()[0];
const int c = input.dims()[1];
const int in_h = input.dims()[2];
const int in_w = input.dims()[3];
const int out_h = x.dims()[1];
const int out_w = x.dims()[2];
auto x_t = EigenTensor<T, 3>::From(x);
auto y_t = EigenTensor<T, 3>::From(y);
auto output_t =
EigenTensor<T, 4>::From(*output).setConstant(static_cast<T>(0));
auto input_t = EigenTensor<T, 4>::From(input);

for (int i = 0; i < n; i++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
if (IsInBound(x_t(i, k, l),
y_t(i, k, l),
static_cast<T>(in_w - 1),
static_cast<T>(in_h - 1))) {
for (int j = 0; j < c; j++) {
output_t(i, j, k, l) =
input_t(i,
j,
static_cast<int>(round(y_t(i, k, l))),
static_cast<int>(round(x_t(i, k, l))));
}
}
}
}
}
}

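// Computes, for every sampling point, the four surrounding integer
// coordinates (west/east and north/south), the distances to them (used as
// the bilinear weights), and the input values at the four corners.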
template <typename T>
void AllNeighbors(const X86Context& ctx,
const Tensor& input,
Tensor* grid_x,
Tensor* grid_y,
Tensor* x_w,
Tensor* x_e,
Tensor* y_n,
Tensor* y_s, // positions
Tensor* d_w,
Tensor* d_e,
Tensor* d_n,
Tensor* d_s, // distance
Tensor* v_wn,
Tensor* v_en,
Tensor* v_ws,
Tensor* v_es) { // values
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();

const int c = input.dims()[1];
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
// calculate coords of 4 corner points
DDim dim{{n, out_h, out_w}};
x_w->Resize(dim);
x_e->Resize(dim);
y_n->Resize(dim);
y_s->Resize(dim);
x_w->template mutable_data<T>();
x_e->template mutable_data<T>();
y_n->template mutable_data<T>();
y_s->template mutable_data<T>();
auto x_w_t = EigenTensor<T, 3>::From(*x_w);
auto x_e_t = EigenTensor<T, 3>::From(*x_e);
auto y_n_t = EigenTensor<T, 3>::From(*y_n);
auto y_s_t = EigenTensor<T, 3>::From(*y_s);

auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);

x_w_t.device(place) = grid_x_t.floor();
x_e_t.device(place) = x_w_t + static_cast<T>(1);
y_n_t.device(place) = grid_y_t.floor();
y_s_t.device(place) = y_n_t + static_cast<T>(1);

// calculate distances to 4 sides
d_w->Resize(dim);
d_e->Resize(dim);
d_n->Resize(dim);
d_s->Resize(dim);
d_w->template mutable_data<T>();
d_e->template mutable_data<T>();
d_n->template mutable_data<T>();
d_s->template mutable_data<T>();
auto d_w_t = EigenTensor<T, 3>::From(*d_w);
auto d_e_t = EigenTensor<T, 3>::From(*d_e);
auto d_n_t = EigenTensor<T, 3>::From(*d_n);
auto d_s_t = EigenTensor<T, 3>::From(*d_s);
d_w_t.device(place) = grid_x_t - x_w_t;
d_e_t.device(place) = x_e_t - grid_x_t;
d_n_t.device(place) = grid_y_t - y_n_t;
d_s_t.device(place) = y_s_t - grid_y_t;

// calculate the input values at the 4 corner points
DDim v_dim{{n, c, out_h, out_w}};
v_wn->Resize(v_dim);
v_en->Resize(v_dim);
v_ws->Resize(v_dim);
v_es->Resize(v_dim);
v_wn->template mutable_data<T>();
v_en->template mutable_data<T>();
v_ws->template mutable_data<T>();
v_es->template mutable_data<T>();
GetGridPointValue<T>(input, v_wn, *x_w, *y_n);
GetGridPointValue<T>(input, v_en, *x_e, *y_n);
GetGridPointValue<T>(input, v_ws, *x_w, *y_s);
GetGridPointValue<T>(input, v_es, *x_e, *y_s);
}

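// Bilinear interpolation: each corner value is weighted by the distances to
// the opposite corner, i.e.
//   out = v_wn*d_e*d_s + v_en*d_w*d_s + v_ws*d_e*d_n + v_es*d_w*d_n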
template <typename T>
void BilinearInter(const X86Context& ctx,
const Tensor& input,
Tensor* grid_x,
Tensor* grid_y,
Tensor* out) {
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
const int n = grid_x->dims()[0];
const int out_h = grid_x->dims()[1];
const int out_w = grid_x->dims()[2];
const int c = input.dims()[1];

Tensor x_w, x_e, y_n, y_s;
Tensor d_w, d_e, d_n, d_s;
Tensor v_wn, v_en, v_ws, v_es;

AllNeighbors<T>(ctx,
input,
grid_x,
grid_y,
&x_w,
&x_e,
&y_n,
&y_s,
&d_w,
&d_e,
&d_n,
&d_s,
&v_wn,
&v_en,
&v_ws,
&v_es);

auto d_w_t = EigenTensor<T, 3>::From(d_w);
auto d_e_t = EigenTensor<T, 3>::From(d_e);
auto d_n_t = EigenTensor<T, 3>::From(d_n);
auto d_s_t = EigenTensor<T, 3>::From(d_s);

auto d_w_scaled_t =
d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_e_scaled_t =
d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_n_scaled_t =
d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto d_s_scaled_t =
d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
auto v_en_t = EigenTensor<T, 4>::From(v_en);
auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
auto v_es_t = EigenTensor<T, 4>::From(v_es);
auto output_t = EigenTensor<T, 4>::From(*out);
// bilinear interpolation using the 4 corner points
output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
v_en_t * d_w_scaled_t * d_s_scaled_t +
v_ws_t * d_e_scaled_t * d_n_scaled_t +
v_es_t * d_w_scaled_t * d_n_scaled_t;
}

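// Kernel entry point: zero-fills the output, converts the normalized grid
// to pixel-space sampling locations, then either interpolates bilinearly or
// gathers the nearest neighbor depending on `mode`.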
template <class T>
void GridSamplerCompute<T>::Run() {
auto& param = this->Param<param_t>();
auto& context = ctx_->As<X86Context>();
auto* input = param.x;
auto* grid = param.grid;
auto* output = param.out;
const std::string padding_mode = param.padding_mode;
const std::string mode = param.mode;
const bool align_corners = param.align_corners;

auto input_dims = input->dims();
const int in_h = input_dims[2];
const int in_w = input_dims[3];

output->template mutable_data<T>();
lite::x86::math::SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, output, static_cast<T>(0));

Tensor grid_x, grid_y;
CalcGridLocations<T>(context,
*grid,
in_h,
in_w,
align_corners,
padding_mode,
&grid_x,
&grid_y);
if (mode == "bilinear") {
BilinearInter<T>(context, *input, &grid_x, &grid_y, output);
} else if (mode == "nearest") {
auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
grid_x_t = grid_x_t.round();
grid_y_t = grid_y_t.round();
GetGridPointValue<T>(*input, output, grid_x, grid_y);
}
}

} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(grid_sampler,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::GridSamplerCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Grid", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
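
For reference, the bilinear path above boils down to the following minimal standalone sketch for a single sampling point (illustration only, not part of the commit; the function name GridSampleBilinearAt is ours, and it assumes float NCHW data, align_corners == false, and "border" padding):

#include <algorithm>
#include <cmath>

// Samples one output value from a single-image NCHW float tensor at the
// normalized grid location (gx, gy) in [-1, 1], for channel c.
float GridSampleBilinearAt(const float* input, int H, int W,
                           int c, float gx, float gy) {
  // Unnormalize (align_corners == false): [-1, 1] -> pixel coordinates.
  float x = ((gx + 1.f) * W - 1.f) * 0.5f;
  float y = ((gy + 1.f) * H - 1.f) * 0.5f;
  // "border" padding: clamp to the valid pixel range.
  x = std::min(std::max(x, 0.f), static_cast<float>(W - 1));
  y = std::min(std::max(y, 0.f), static_cast<float>(H - 1));
  // Four surrounding corners and the distances used as weights.
  int xw = static_cast<int>(std::floor(x));
  int yn = static_cast<int>(std::floor(y));
  int xe = std::min(xw + 1, W - 1);  // clamped edge gets zero weight below
  int ys = std::min(yn + 1, H - 1);
  float dw = x - xw, de = 1.f - dw;
  float dn = y - yn, ds = 1.f - dn;
  const float* p = input + c * H * W;
  return p[yn * W + xw] * de * ds + p[yn * W + xe] * dw * ds +
         p[ys * W + xw] * de * dn + p[ys * W + xe] * dw * dn;
}

The Eigen-based kernel computes exactly these four weighted products, but vectorized over the whole (n, c, out_h, out_w) output at once.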
36 changes: 36 additions & 0 deletions lite/kernels/x86/grid_sampler_compute.h
@@ -0,0 +1,36 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

template <class T>
class GridSamplerCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::GridSamplerParam;

void Run() override;

virtual ~GridSamplerCompute() = default;
};

} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle