Skip to content

Commit

Permalink
[XPU] Add fp16 support to some ops. (#9254)
Browse files Browse the repository at this point in the history
  • Loading branch information
wbn03 authored Jul 25, 2022
1 parent 608b813 commit 1d2d339
Show file tree
Hide file tree
Showing 20 changed files with 588 additions and 164 deletions.
9 changes: 8 additions & 1 deletion lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,18 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
"conv2d_transpose",
"elementwise_mul",
"elementwise_add",
"elementwise_mod",
"elementwise_floordiv",
"reduce_mean",
"bilinear_interp",
"bilinear_interp_v2",
"nearest_interp",
"nearest_interp_v2"};
"nearest_interp_v2",
"tile",
"transpose",
"pixel_shuffle",
"expand_v2",
"meshgrid"};
const std::set<std::string> xpu_inplace_op_{"reshape",
"reshape2",
"flatten",
Expand Down
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ add_kernel(expand_v2_compute_xpu XPU basic SRCS expand_v2_compute.cc)
add_kernel(range_compute_xpu XPU extra SRCS range_compute.cc)
add_kernel(where_compute_xpu XPU extra SRCS where_compute.cc)
add_kernel(gather_nd_compute_xpu XPU extra SRCS gather_nd_compute.cc)
add_kernel(meshgrid_compute_xpu XPU basic SRCS meshgrid_compute.cc)

# extra
add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc)
Expand Down
38 changes: 28 additions & 10 deletions lite/kernels/xpu/activation_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,26 @@ void ReluCompute<T, PType>::Run() {
CHECK_EQ(r, 0);
}

void Relu6Compute::Run() {
template <typename T, PrecisionType PType>
void Relu6Compute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::relu6(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}

void GeluCompute::Run() {
template <typename T, PrecisionType PType>
void GeluCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::gelu(ctx.GetRawContext(),
param.X->data<float>(),
param.Out->mutable_data<float>(TARGET(kXPU)),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}
Expand Down Expand Up @@ -303,17 +305,33 @@ REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

// relu6 kernel registrations: FP32 variant under the default "def" alias
// and an FP16 variant bound to kFP16 tensors. (The old non-template
// registration that duplicated the FP32 entry has been removed.)
using relu6FP32 =
    paddle::lite::kernels::xpu::Relu6Compute<float, PRECISION(kFloat)>;
using relu6FP16 =
    paddle::lite::kernels::xpu::Relu6Compute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(relu6, kXPU, kFloat, kNCHW, relu6FP32, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(relu6, kXPU, kFP16, kNCHW, relu6FP16, relu6FP16)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .Finalize();

// gelu kernel registrations: FP32 ("def") and FP16 variants.
// Removed the stale non-template registration and the unused duplicate
// alias `gelu_fp16`, which repeated `geluFP16` verbatim and was never
// referenced.
using geluFP32 =
    paddle::lite::kernels::xpu::GeluCompute<float, PRECISION(kFloat)>;
using geluFP16 =
    paddle::lite::kernels::xpu::GeluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(gelu, kXPU, kFloat, kNCHW, geluFP32, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
REGISTER_LITE_KERNEL(gelu, kXPU, kFP16, kNCHW, geluFP16, geluFP16)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .Finalize();

using tanhFP32 =
paddle::lite::kernels::xpu::TanhCompute<float, PRECISION(kFloat)>;
Expand Down
6 changes: 4 additions & 2 deletions lite/kernels/xpu/activation_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ class ReluCompute : public KernelLite<TARGET(kXPU), PType> {
virtual ~ReluCompute() = default;
};

class Relu6Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class Relu6Compute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand All @@ -39,7 +40,8 @@ class Relu6Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
virtual ~Relu6Compute() = default;
};

class GeluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class GeluCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ActivationParam;

Expand Down
203 changes: 188 additions & 15 deletions lite/kernels/xpu/elementwise_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,44 @@ struct MaxFunctor {
}
};

template <class T, class Functor>
void ElementwiseCompute<T, Functor>::Run() {
template <typename T>
struct MinFunctor {
inline int operator()(xdnn::Context* ctx,
const T* x,
const T* y,
T* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) const {
return xdnn::broadcast_min<T>(ctx, x, y, z, xshape, yshape);
}
};

template <typename T>
struct ModFunctor {
inline int operator()(xdnn::Context* ctx,
const T* x,
const T* y,
T* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) const {
return xdnn::broadcast_mod<T>(ctx, x, y, z, xshape, yshape);
}
};

template <typename T>
struct FloordivFunctor {
inline int operator()(xdnn::Context* ctx,
const T* x,
const T* y,
T* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) const {
return xdnn::broadcast_floordiv<T>(ctx, x, y, z, xshape, yshape);
}
};

template <class T, class Functor, PrecisionType PType>
void ElementwiseCompute<T, Functor, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
const Tensor* x = param.X;
Expand Down Expand Up @@ -131,20 +167,73 @@ void ElementwiseCompute<T, Functor>::Run() {
} // namespace paddle

namespace xpu = paddle::lite::kernels::xpu;

// Kernel class aliases, one per (op, element type) pair. The third template
// argument is the precision the kernel registers under: float16 variants use
// kFP16; everything else (including the int32/int64 variants) registers under
// kFloat. Stale two-argument aliases left over from before the precision
// parameter was added have been removed — they duplicated the names below and
// no longer match the three-parameter ElementwiseCompute template.
using AddFloat32 =
    xpu::ElementwiseCompute<float, xpu::AddFunctor<float>, PRECISION(kFloat)>;
using AddFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::AddFunctor<float16>,
                                           PRECISION(kFP16)>;
using AddInt32 =
    xpu::ElementwiseCompute<int, xpu::AddFunctor<int>, PRECISION(kFloat)>;
using AddInt64 = xpu::ElementwiseCompute<int64_t,
                                         xpu::AddFunctor<int64_t>,
                                         PRECISION(kFloat)>;

using SubFloat32 =
    xpu::ElementwiseCompute<float, xpu::SubFunctor<float>, PRECISION(kFloat)>;
using SubFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::SubFunctor<float16>,
                                           PRECISION(kFP16)>;

using MulFloat32 =
    xpu::ElementwiseCompute<float, xpu::MulFunctor<float>, PRECISION(kFloat)>;
using MulFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::MulFunctor<float16>,
                                           PRECISION(kFP16)>;
using MulInt64 = xpu::ElementwiseCompute<int64_t,
                                         xpu::MulFunctor<int64_t>,
                                         PRECISION(kFloat)>;

using DivFloat32 =
    xpu::ElementwiseCompute<float, xpu::DivFunctor<float>, PRECISION(kFloat)>;
using DivFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::DivFunctor<float16>,
                                           PRECISION(kFP16)>;

using MaxFloat32 =
    xpu::ElementwiseCompute<float, xpu::MaxFunctor<float>, PRECISION(kFloat)>;
using MaxFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::MaxFunctor<float16>,
                                           PRECISION(kFP16)>;
using MaxInt32 =
    xpu::ElementwiseCompute<int, xpu::MaxFunctor<int>, PRECISION(kFloat)>;

using MinFloat32 =
    xpu::ElementwiseCompute<float, xpu::MinFunctor<float>, PRECISION(kFloat)>;
using MinFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::MinFunctor<float16>,
                                           PRECISION(kFP16)>;
using MinInt32 =
    xpu::ElementwiseCompute<int, xpu::MinFunctor<int>, PRECISION(kFloat)>;

using ModFloat32 =
    xpu::ElementwiseCompute<float, xpu::ModFunctor<float>, PRECISION(kFloat)>;
using ModFloat16 = xpu::ElementwiseCompute<float16,
                                           xpu::ModFunctor<float16>,
                                           PRECISION(kFP16)>;
using ModInt32 =
    xpu::ElementwiseCompute<int, xpu::ModFunctor<int>, PRECISION(kFloat)>;

using FloordivFloat32 = xpu::ElementwiseCompute<float,
                                                xpu::FloordivFunctor<float>,
                                                PRECISION(kFloat)>;
using FloordivFloat16 = xpu::ElementwiseCompute<float16,
                                                xpu::FloordivFunctor<float16>,
                                                PRECISION(kFP16)>;
using FloordivInt32 =
    xpu::ElementwiseCompute<int, xpu::FloordivFunctor<int>, PRECISION(kFloat)>;

REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
Expand All @@ -153,7 +242,7 @@ REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddFloat32, def)
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_add, kXPU, kFloat, kNCHW, AddFloat16, DISABLE_XPU1_AddFloat16)
elementwise_add, kXPU, kFP16, kNCHW, AddFloat16, DISABLE_XPU1_AddFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
Expand All @@ -177,14 +266,21 @@ REGISTER_LITE_KERNEL(elementwise_sub, kXPU, kFloat, kNCHW, SubFloat32, def)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

// FP16 elementwise_sub. The DISABLE_XPU1_* alias suggests this variant is
// not selected on XPU1 hardware — confirm against the kernel-pick pass.
REGISTER_LITE_KERNEL(
elementwise_sub, kXPU, kFP16, kNCHW, SubFloat16, DISABLE_XPU1_SubFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

// FP32 elementwise_mul under the default "def" alias.
REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_mul, kXPU, kFloat, kNCHW, MulFloat16, DISABLE_XPU1_MulFloat16)
elementwise_mul, kXPU, kFP16, kNCHW, MulFloat16, DISABLE_XPU1_MulFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
Expand All @@ -202,14 +298,91 @@ REGISTER_LITE_KERNEL(elementwise_div, kXPU, kFloat, kNCHW, DivFloat32, def)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

// FP16 elementwise_div. DISABLE_XPU1_* alias — presumably skipped on XPU1;
// confirm against the kernel-pick pass.
REGISTER_LITE_KERNEL(
elementwise_div, kXPU, kFP16, kNCHW, DivFloat16, DISABLE_XPU1_DivFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

// elementwise_max: FP32 ("def"), FP16, and int32 variants.
REGISTER_LITE_KERNEL(elementwise_max, kXPU, kFloat, kNCHW, MaxFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_max, kXPU, kFP16, kNCHW, MaxFloat16, DISABLE_XPU1_MaxFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

// Int32 variants register under kFloat with the "int32" alias; the bound
// tensor precisions carry the actual kInt32 type.
REGISTER_LITE_KERNEL(elementwise_max, kXPU, kFloat, kNCHW, MaxInt32, int32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();

// elementwise_min: FP32 ("def"), FP16, and int32 variants.
REGISTER_LITE_KERNEL(elementwise_min, kXPU, kFloat, kNCHW, MinFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_min, kXPU, kFP16, kNCHW, MinFloat16, DISABLE_XPU1_MinFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(elementwise_min, kXPU, kFloat, kNCHW, MinInt32, int32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();

// elementwise_mod: FP32 ("def"), FP16, and int32 variants.
REGISTER_LITE_KERNEL(elementwise_mod, kXPU, kFloat, kNCHW, ModFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_mod, kXPU, kFP16, kNCHW, ModFloat16, DISABLE_XPU1_ModFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(elementwise_mod, kXPU, kFloat, kNCHW, ModInt32, int32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();

// elementwise_floordiv: FP32 ("def"), FP16, and int32 variants.
REGISTER_LITE_KERNEL(
elementwise_floordiv, kXPU, kFloat, kNCHW, FloordivFloat32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(elementwise_floordiv,
kXPU,
kFP16,
kNCHW,
FloordivFloat16,
DISABLE_XPU1_FloordivFloat16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
elementwise_floordiv, kXPU, kFloat, kNCHW, FloordivInt32, int32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();
4 changes: 2 additions & 2 deletions lite/kernels/xpu/elementwise_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ namespace lite {
namespace kernels {
namespace xpu {

template <class T, class Functor>
class ElementwiseCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <class T, class Functor, PrecisionType PType>
class ElementwiseCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::ElementwiseParam;

Expand Down
Loading

0 comments on commit 1d2d339

Please sign in to comment.