[XPU] Support several ops at fp16 precision.
stevenshen36 committed Feb 23, 2023
1 parent e79c751 commit 3393166
Showing 11 changed files with 474 additions and 22 deletions.
91 changes: 91 additions & 0 deletions lite/kernels/x86/calib_compute.cc
@@ -34,6 +34,26 @@ void CalibComputeFp32ToInt8<Ptype, DLType>::Run() {
din, dout, scale.data(), 1, 1, param.input->numel());
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp32ToFp16<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float>();
auto* dout = param.output->template mutable_data<float16>();
for (int64_t i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float16>(din[i]);
}
}
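// Note: the fp32 -> fp16 cast above rounds to the nearest representable
// half value; fp16 keeps an 11-bit significand, so integers above 2048
// can lose exactness and magnitudes above 65504 overflow. The
// fp16 -> fp32 widening cast below is exact.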

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp16ToFp32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float16>();
auto* dout = param.output->template mutable_data<float>();
for (int64_t i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt64ToInt32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
@@ -84,6 +104,26 @@ void CalibComputeFp32ToInt32<Ptype, DLType>::Run() {
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt32ToFp16<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<int32_t>();
auto* dout = param.output->template mutable_data<float16>();
for (int64_t i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float16>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp16ToInt32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float16>();
auto* dout = param.output->template mutable_data<int32_t>();
for (int64_t i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<int32_t>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt64ToFp32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
@@ -171,6 +211,39 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp32ToFp16<PRECISION(kFP16),
DATALAYOUT(kNCHW)>
fp16_fp32_to_fp16;
REGISTER_LITE_KERNEL(calib, kX86, kFP16, kNCHW, fp16_fp32_to_fp16, fp32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp32ToFp16<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_fp32_to_fp16;
REGISTER_LITE_KERNEL(calib, kX86, kFloat, kNCHW, fp_fp32_to_fp16, fp32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp16ToFp32<PRECISION(kFP16),
DATALAYOUT(kNCHW)>
fp16_fp16_to_fp32;
REGISTER_LITE_KERNEL(calib, kX86, kFP16, kNCHW, fp16_fp16_to_fp32, fp16_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp16ToFp32<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_fp16_to_fp32;
REGISTER_LITE_KERNEL(calib, kX86, kFloat, kNCHW, fp_fp16_to_fp32, fp16_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();
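// Note: the same fp32 <-> fp16 conversions are registered under both the
// kFP16 and the kFloat kernel-precision slots above, presumably so the
// precision-cast pass can match a calib kernel from either side of a
// mixed-precision boundary.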

REGISTER_LITE_KERNEL(
calib_once, kX86, kInt8, kNCHW, i8_fp32_to_int8, fp32_to_int8)
.BindInput("Input",
@@ -223,3 +296,21 @@ REGISTER_LITE_KERNEL(
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeInt32ToFp16<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_int32_to_fp16;
REGISTER_LITE_KERNEL(
calib, kX86, kFloat, kNCHW, fp_int32_to_fp16, int32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp16ToInt32<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_fp16_to_int32;
REGISTER_LITE_KERNEL(
calib, kX86, kFloat, kNCHW, fp_fp16_to_int32, fp16_to_int32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
.Finalize();
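
For orientation, here is a minimal standalone sketch of the element-wise conversion these calib kernels perform. It uses the _Float16 compiler extension (GCC/Clang) as a stand-in for Lite's float16 type, so it illustrates only the casting semantics and is not Lite code:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> src = {0.1f, 1.5f, 2049.0f};
  std::vector<_Float16> half(src.size());
  std::vector<float> round_trip(src.size());
  // fp32 -> fp16: rounds to the nearest representable half value.
  for (std::size_t i = 0; i < src.size(); ++i)
    half[i] = static_cast<_Float16>(src[i]);
  // fp16 -> fp32: exact widening cast.
  for (std::size_t i = 0; i < src.size(); ++i)
    round_trip[i] = static_cast<float>(half[i]);
  // 1.5 survives exactly; 0.1 and 2049 come back slightly changed.
  for (std::size_t i = 0; i < src.size(); ++i)
    std::cout << src[i] << " -> " << round_trip[i] << "\n";
  return 0;
}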
47 changes: 47 additions & 0 deletions lite/kernels/x86/calib_compute.h
@@ -33,6 +33,30 @@ class CalibComputeFp32ToInt8 : public KernelLite<TARGET(kX86), Ptype, DLType> {
private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp32ToFp16() override{};

private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp16ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp16ToFp32() override{};

private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeInt64ToInt32
: public KernelLite<TARGET(kX86), Ptype, DLType> {
@@ -107,6 +131,29 @@ class CalibComputeInt64ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeInt32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeInt32ToFp16() override{};

private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp16ToInt32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp16ToInt32() override{};

private:
};

} // namespace x86
} // namespace kernels
} // namespace lite
2 changes: 2 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -110,6 +110,8 @@ add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)
add_kernel(select_input_compute_xpu XPU extra SRCS select_input_compute.cc)
add_kernel(group_norm_compute_xpu XPU extra SRCS group_norm_compute.cc)
add_kernel(deformable_conv_compute_xpu XPU extra SRCS deformable_conv_compute.cc)
add_kernel(sin_compute_xpu XPU extra SRCS sin_compute.cc)
add_kernel(cos_compute_xpu XPU extra SRCS cos_compute.cc)

# extra(fused kernel)
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)
33 changes: 22 additions & 11 deletions lite/kernels/xpu/activation_compute.cc
@@ -385,31 +385,42 @@ REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

using siluFP32_FP32 =
paddle::lite::kernels::xpu::SiluCompute<float, PRECISION(kFloat)>;
using siluFP16_FP32 =
paddle::lite::kernels::xpu::SiluCompute<float16, PRECISION(kFloat)>;
using siluFP32_FP16 =
paddle::lite::kernels::xpu::SiluCompute<float, PRECISION(kFP16)>;
using siluFP16_FP16 =
paddle::lite::kernels::xpu::SiluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP32_FP32, silu_fp32_fp32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP16_FP32, silu_fp16_fp32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP32_FP16, silu_fp32_fp16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP16_FP16, silu_fp16_fp16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

using eluFP32 =
paddle::lite::kernels::xpu::EluCompute<float, PRECISION(kFloat)>;
using eluFP16 =
paddle::lite::kernels::xpu::EluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(elu, kXPU, kFloat, kNCHW, eluFP32, DISABLE_XPU1_eluFP32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(elu, kXPU, kFP16, kNCHW, eluFP16, DISABLE_XPU1_eluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
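
The alias names above encode <element type, registered kernel precision>: one templated functor is instantiated for float and float16 and registered under both the kFloat and kFP16 precision slots. A toy sketch of that pattern, with hypothetical names (Precision, SiluToy) that are not Lite's:

#include <cmath>
#include <cstdio>

enum class Precision { kFloat, kFP16 };

// One templated functor, instantiated per (element type, precision slot).
template <typename T, Precision P>
struct SiluToy {
  static T Run(T x) {
    // silu(x) = x * sigmoid(x) = x / (1 + e^-x)
    return static_cast<T>(x / (T(1) + std::exp(-x)));
  }
};

int main() {
  // Two instantiations of the same code path, analogous to
  // siluFP32_FP32 and siluFP32_FP16 above.
  std::printf("%f\n", SiluToy<float, Precision::kFloat>::Run(1.0f));
  std::printf("%f\n", SiluToy<float, Precision::kFP16>::Run(1.0f));
  return 0;
}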
83 changes: 83 additions & 0 deletions lite/kernels/xpu/calib_compute.cc
@@ -113,6 +113,29 @@ using xpu_calib_fp32_to_fp16 =
using xpu_calib_fp16_to_fp32 =
paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFloat)>;

using xpu_calib_fp32_to_fp16_kfp16 =
paddle::lite::kernels::xpu::CalibCompute<float, float16, PRECISION(kFP16)>;
using xpu_calib_fp16_to_fp32_kfp16 =
paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFP16)>;

using xpu_calib_int64_to_fp16 =
paddle::lite::kernels::xpu::CalibCompute<int64_t,
float16,
PRECISION(kFP16)>;
using xpu_calib_fp16_to_int64 =
paddle::lite::kernels::xpu::CalibCompute<float16,
int64_t,
PRECISION(kFP16)>;

using xpu_calib_int32_to_fp16 =
paddle::lite::kernels::xpu::CalibCompute<int32_t,
float16,
PRECISION(kFP16)>;
using xpu_calib_fp16_to_int32 =
paddle::lite::kernels::xpu::CalibCompute<float16,
int32_t,
PRECISION(kFP16)>;

REGISTER_LITE_KERNEL(
calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32)
.BindInput("Input",
@@ -140,6 +163,45 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_fp32_kfp16, calib_fp16_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp32_to_fp16_kfp16, calib_fp32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_int64_to_fp16, calib_int64_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int64, calib_fp16_to_int64)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_int32_to_fp16, calib_int32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int32, calib_fp16_to_int32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFloat,
@@ -175,6 +237,27 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFP16,
kNCHW,
xpu_calib_int64_to_fp16,
calib_int64_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFP16,
kNCHW,
xpu_calib_fp16_to_int64,
calib_fp16_to_int64)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.Finalize();

using xpu_calib_fp32_to_int8 =
paddle::lite::kernels::xpu::CalibCompute<float, int8_t, PRECISION(kInt8)>;

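
The templated CalibCompute that these typedefs instantiate is defined earlier in this file, outside the excerpt. A plausible sketch of its shape, assuming an xdnn::cast_v2-style element-wise cast like the xdnn calls used elsewhere in this backend; the real body may differ:

// Hypothetical reconstruction, for orientation only.
template <typename InType, typename OutType, PrecisionType PType>
void CalibCompute<InType, OutType, PType>::Run() {
  auto& param = this->template Param<operators::CalibParam>();
  auto& ctx = this->ctx_->template As<XPUContext>();
  int r = xdnn::cast_v2<InType, OutType>(
      ctx.GetRawContext(),
      param.input->template data<InType>(),
      param.output->template mutable_data<OutType>(TARGET(kXPU)),
      param.input->numel());
  CHECK_EQ(r, 0);  // xdnn APIs return 0 on success
}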
54 changes: 54 additions & 0 deletions lite/kernels/xpu/cos_compute.cc
@@ -0,0 +1,54 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/cos_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

template <typename T, PrecisionType PType>
void CosCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::cos(ctx.GetRawContext(),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

namespace xpu = paddle::lite::kernels::xpu;

using cosFP32 = xpu::CosCompute<float, PRECISION(kFloat)>;
using cosFP16 = xpu::CosCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(cos, kXPU, kFloat, kNCHW, cosFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(cos, kXPU, kFP16, kNCHW, cosFP16, cosFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();
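
The CMakeLists change earlier also adds sin_compute.cc, which this excerpt does not show; it presumably mirrors the cos kernel one-for-one with xdnn::sin. A sketch of its registration tail under that assumption (names inferred, not confirmed):

using sinFP32 =
paddle::lite::kernels::xpu::SinCompute<float, PRECISION(kFloat)>;
using sinFP16 =
paddle::lite::kernels::xpu::SinCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(sin, kXPU, kFloat, kNCHW, sinFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sin, kXPU, kFP16, kNCHW, sinFP16, sinFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();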